# Load the UCI Adult data set (train/test splits). Fields are comma-separated
# and missing entries appear as " ?" in the raw files, so strip surrounding
# whitespace and treat "?" as NA at read time. The raw files have no header
# row; column names are assigned separately below.
# Use TRUE rather than the reassignable shorthand T.
train <- read.table("../data/rawdata/adult.data.txt", sep = ",", na.strings = "?",
                    strip.white = TRUE)
test <- read.table("../data/rawdata/adult.test.txt", sep = ",", na.strings = "?",
                   strip.white = TRUE)
# Sanity-check dimensions: the Adult data set ships with 32,561 training rows
# and 16,281 test rows, each with 15 columns (14 features + income label).
dim(train)
## [1] 32561 15
dim(test)
## [1] 16281 15
# Column names per the UCI Adult "adult.names" description (the raw files have
# no header row). Define the vector once and assign it to both data sets so
# the two copies cannot drift apart.
adult_cols <- c("age", "workclass", "fnlwgt", "education", "education-num",
                "marital-status", "occupation", "relationship", "race", "sex",
                "capital-gain", "capital-loss", "hours-per-week",
                "native-country", "income")
colnames(train) <- adult_cols
colnames(test) <- adult_cols
# Find missing values and NAs for the training set.
# For each column, report (1) rows that are NA, (2) rows equal to "", and
# (3) rows equal to " ?". Because read.table() was called with
# na.strings = "?" and strip.white = TRUE, all "?" entries were already
# converted to NA on load, so checks (2) and (3) are sanity checks and are
# expected to find nothing.
# Output strings are kept exactly as before so the recorded output below
# still matches.
for (i in seq_len(ncol(train))) {
  col_name <- colnames(train)[i]

  # (1) NA rows for this column.
  na_rows <- rownames(train)[is.na(train[, i])]
  cat("<names of NA rows in", col_name, "variable>", "\n")
  cat(na_rows, "\n")
  cat("Number of NA values: ", length(na_rows), "\n")
  print("======================================")
  print("======================================")

  # (2) Rows holding an empty string.
  empty_rows <- rownames(train[which(train[, i] == ""), ])
  cat("<names of rows contain missing values in", col_name, "variable>", "\n")
  cat(empty_rows, "\n")
  cat("Number of Missing values : ", length(empty_rows), "\n")
  print("======================================")
  print("======================================")

  # (3) Rows holding a literal " ?" (would indicate na.strings didn't apply).
  qmark_rows <- rownames(train[which(train[, i] == " ?"), ])
  cat("<names of rows contain ? values in", col_name, "variable>", "\n")
  cat(qmark_rows, "\n")
  cat("Number of ? values : ", length(qmark_rows), "\n")
  print("======================================")
  print("======================================")
}
## <names of NA rows in age variable>
##
## Number of NA values: 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in age variable>
##
## Number of Missing values : 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in age variable>
##
## Number of ? values : 0
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in workclass variable>
## 28 62 70 78 107 129 150 155 161 188 202 222 227 244 267 298 313 327 347 348 355 398 409 431 432 450 460 472 485 487 500 512 516 518 519 540 577 581 591 592 597 649 657 669 672 687 696 735 789 790 793 807 812 831 835 886 891 904 925 932 970 983 1020 1035 1036 1040 1047 1098 1101 1132 1135 1153 1168 1176 1181 1186 1209 1216 1218 1263 1283 1291 1313 1318 1326 1348 1350 1372 1389 1405 1421 1434 1442 1453 1459 1505 1545 1563 1570 1571 1575 1580 1593 1607 1630 1657 1666 1677 1705 1708 1759 1762 1774 1779 1824 1847 1852 1866 1879 1924 1932 1972 1988 2026 2037 2047 2062 2073 2085 2092 2095 2106 2119 2127 2153 2156 2164 2165 2211 2214 2223 2282 2293 2324 2328 2341 2355 2357 2359 2360 2373 2381 2383 2398 2421 2428 2465 2477 2487 2492 2497 2507 2514 2522 2545 2567 2571 2572 2579 2587 2595 2607 2633 2635 2639 2674 2677 2690 2752 2761 2762 2848 2857 2858 2859 2886 2931 2933 2948 2953 2962 3000 3006 3034 3043 3066 3073 3089 3097 3120 3132 3147 3208 3212 3229 3232 3240 3256 3270 3292 3298 3331 3339 3352 3372 3388 3403 3440 3454 3457 3460 3487 3517 3532 3556 3573 3574 3580 3589 3593 3595 3598 3632 3671 3704 3726 3737 3745 3748 3760 3774 3776 3806 3823 3835 3844 3852 3864 3888 3896 3898 3902 3903 3917 3943 3948 3950 3951 3964 3970 3981 3991 4004 4018 4019 4022 4073 4081 4087 4091 4110 4148 4153 4156 4170 4175 4201 4213 4216 4218 4241 4273 4289 4300 4310 4316 4338 4370 4394 4410 4414 4423 4436 4438 4461 4465 4500 4501 4520 4532 4549 4554 4593 4607 4608 4614 4622 4628 4656 4686 4689 4722 4730 4746 4753 4756 4767 4779 4783 4802 4813 4819 4836 4839 4885 4926 4943 4959 4971 4981 4983 5021 5065 5148 5157 5173 5194 5199 5208 5210 5215 5229 5256 5295 5297 5303 5308 5321 5322 5341 5345 5347 5384 5385 5441 5447 5472 5493 5527 5530 5549 5562 5566 5591 5624 5633 5653 5680 5688 5721 5754 5767 5789 5804 5809 5833 5854 5917 5922 5929 5978 5984 6016 6040 6052 6060 6132 6179 6232 6285 6286 6315 6343 6352 6408 6433 6449 6511 6522 6537 6543 6550 6559 6565 6592 6641 6647 6664 6680 6734 6735 6754 
6767 6799 6835 6861 6863 6878 6897 6915 6936 6949 6994 6996 7012 7028 7050 7076 7101 7103 7107 7137 7150 7165 7168 7174 7194 7292 7302 7323 7341 7353 7438 7458 7464 7473 7511 7555 7560 7561 7577 7580 7585 7612 7664 7684 7725 7741 7747 7751 7764 7774 7785 7788 7816 7827 7840 7863 7873 7877 7901 7906 7943 7964 7972 7978 8000 8003 8008 8023 8043 8054 8058 8070 8086 8089 8097 8099 8101 8135 8148 8169 8190 8194 8223 8242 8298 8323 8365 8388 8430 8447 8448 8473 8500 8533 8544 8566 8608 8637 8644 8674 8693 8695 8750 8758 8765 8770 8783 8789 8796 8806 8823 8848 8854 8909 8921 8941 8950 8955 8964 8986 8992 8997 9029 9031 9107 9139 9141 9142 9148 9149 9156 9171 9179 9198 9212 9215 9246 9294 9325 9341 9343 9352 9354 9359 9368 9410 9453 9478 9485 9491 9501 9532 9537 9550 9558 9578 9583 9617 9627 9651 9704 9709 9713 9779 9788 9861 9873 9886 9908 9927 9928 9939 9988 10014 10016 10017 10036 10057 10065 10095 10099 10103 10111 10118 10127 10140 10144 10162 10223 10233 10254 10329 10343 10362 10412 10426 10438 10441 10461 10476 10486 10487 10540 10547 10571 10582 10674 10680 10682 10684 10685 10701 10705 10710 10716 10719 10746 10747 10785 10806 10821 10829 10838 10857 10882 10932 10933 10956 10960 10996 11002 11028 11040 11047 11057 11060 11086 11088 11100 11159 11160 11165 11193 11199 11218 11229 11235 11262 11287 11295 11317 11329 11335 11341 11347 11356 11392 11414 11422 11431 11461 11475 11485 11501 11517 11527 11533 11545 11551 11574 11579 11581 11592 11615 11622 11659 11689 11692 11714 11732 11733 11735 11769 11771 11774 11794 11852 11865 11939 12008 12009 12021 12030 12069 12094 12099 12131 12154 12162 12176 12199 12214 12215 12219 12254 12300 12327 12335 12352 12374 12378 12406 12412 12429 12439 12452 12492 12493 12544 12554 12590 12610 12628 12652 12668 12785 12795 12850 12854 12859 12909 12919 12920 12937 12982 12992 12997 13026 13027 13043 13044 13066 13070 13075 13096 13111 13154 13162 13178 13182 13183 13204 13235 13291 13303 13322 13348 13366 13371 13385 13448 13494 
13499 13505 13516 13526 13535 13553 13557 13570 13587 13609 13627 13647 13712 13745 13785 13816 13844 13846 13885 13934 13950 14006 14054 14068 14109 14119 14124 14152 14179 14204 14214 14218 14255 14256 14281 14331 14345 14349 14361 14364 14371 14399 14419 14431 14442 14500 14535 14536 14537 14542 14549 14572 14575 14579 14603 14619 14647 14673 14689 14693 14718 14719 14726 14743 14747 14860 14861 14871 14888 14912 14940 14946 14981 14983 15013 15023 15034 15065 15070 15131 15177 15193 15221 15239 15257 15267 15287 15293 15310 15311 15351 15415 15425 15427 15465 15472 15477 15485 15500 15524 15533 15543 15548 15580 15581 15585 15597 15599 15617 15644 15675 15686 15697 15744 15774 15779 15783 15847 15861 15872 15912 15961 16000 16005 16020 16064 16066 16083 16104 16118 16124 16131 16137 16147 16152 16156 16174 16180 16186 16197 16213 16222 16293 16295 16347 16380 16383 16400 16405 16411 16455 16457 16489 16491 16516 16524 16536 16567 16584 16596 16603 16643 16660 16680 16726 16732 16744 16749 16751 16756 16761 16763 16795 16799 16803 16811 16818 16828 16836 16839 16879 16908 16967 16979 16985 17016 17031 17040 17097 17099 17133 17169 17210 17248 17280 17300 17315 17322 17327 17348 17392 17413 17415 17463 17471 17506 17532 17538 17588 17595 17636 17644 17645 17649 17709 17711 17718 17724 17726 17751 17753 17758 17763 17774 17791 17812 17838 17877 17883 17903 17906 18009 18011 18020 18037 18057 18095 18162 18165 18182 18195 18202 18218 18219 18233 18237 18245 18258 18260 18295 18323 18332 18338 18343 18357 18359 18363 18385 18387 18410 18467 18471 18497 18535 18542 18561 18565 18578 18600 18601 18605 18616 18623 18656 18721 18731 18751 18754 18795 18806 18847 18913 18924 18925 18932 18935 18943 18953 18965 18990 18993 19042 19059 19074 19091 19134 19135 19154 19169 19181 19231 19234 19241 19254 19256 19285 19312 19319 19321 19338 19346 19434 19439 19456 19462 19463 19493 19510 19545 19547 19549 19562 19617 19621 19658 19707 19709 19765 19776 19787 19789 19813 19815 
19820 19821 19831 19843 19858 19890 19897 19987 20004 20008 20010 20018 20024 20030 20032 20039 20065 20069 20073 20095 20100 20106 20160 20162 20188 20192 20206 20228 20267 20270 20272 20292 20303 20315 20322 20334 20397 20400 20435 20437 20475 20481 20528 20531 20545 20552 20564 20577 20596 20609 20613 20647 20657 20665 20687 20725 20758 20774 20776 20777 20783 20798 20804 20826 20827 20830 20869 20878 20881 20916 20931 20939 20942 20954 20964 21040 21097 21115 21126 21143 21147 21153 21159 21177 21180 21243 21244 21273 21275 21290 21349 21357 21395 21411 21414 21423 21429 21438 21454 21456 21466 21484 21488 21492 21517 21525 21529 21538 21546 21550 21587 21627 21632 21649 21667 21687 21699 21726 21747 21763 21800 21804 21806 21813 21830 21848 21852 21858 21862 21893 21915 21920 21947 22007 22023 22034 22043 22061 22076 22095 22102 22110 22131 22166 22174 22226 22255 22282 22349 22351 22374 22379 22380 22388 22397 22406 22423 22496 22503 22511 22546 22557 22634 22645 22732 22752 22758 22787 22796 22799 22821 22834 22842 22849 22886 22899 22911 22929 22952 22977 22984 22998 23010 23020 23063 23100 23110 23124 23137 23177 23193 23210 23229 23238 23268 23282 23316 23331 23334 23337 23343 23352 23354 23374 23376 23389 23396 23416 23422 23503 23536 23537 23539 23545 23552 23593 23637 23671 23712 23730 23745 23756 23760 23794 23819 23823 23826 23854 23866 23881 23916 23919 23921 23937 23955 23981 24004 24016 24048 24054 24055 24074 24085 24110 24134 24150 24156 24184 24196 24233 24239 24242 24248 24300 24309 24319 24380 24387 24406 24430 24462 24476 24486 24528 24533 24580 24581 24637 24645 24687 24695 24705 24715 24736 24753 24761 24774 24780 24782 24788 24797 24810 24814 24822 24829 24876 24877 24895 24903 24914 24950 24999 25010 25035 25045 25054 25057 25061 25068 25077 25100 25123 25135 25164 25187 25209 25217 25226 25239 25267 25271 25296 25304 25322 25360 25398 25418 25425 25431 25442 25458 25520 25523 25525 25538 25569 25579 25589 25637 25670 25686 25704 25727 
25749 25756 25773 25803 25807 25814 25820 25827 25836 25847 25854 25878 25903 25929 25930 25931 25977 25981 26013 26041 26052 26082 26094 26120 26128 26138 26144 26146 26164 26226 26245 26260 26288 26296 26297 26313 26332 26349 26364 26374 26376 26411 26417 26422 26451 26490 26504 26513 26514 26523 26550 26567 26587 26599 26600 26682 26688 26689 26777 26792 26839 26867 26925 26929 26959 26983 26987 26993 26999 27010 27019 27023 27039 27052 27086 27087 27101 27108 27141 27145 27180 27202 27226 27266 27268 27318 27322 27332 27351 27383 27396 27427 27453 27467 27476 27499 27508 27514 27519 27523 27550 27564 27570 27579 27580 27591 27596 27614 27643 27658 27666 27677 27699 27703 27708 27724 27731 27771 27775 27809 27845 27854 27899 27916 27936 27945 27972 27994 28010 28021 28038 28044 28113 28139 28144 28145 28146 28149 28165 28176 28195 28207 28211 28242 28272 28277 28294 28304 28321 28345 28366 28383 28385 28395 28426 28477 28479 28504 28544 28549 28587 28604 28630 28687 28689 28716 28774 28784 28803 28812 28850 28856 28860 28886 28892 28905 28919 28923 28934 28944 28954 28958 28962 29027 29041 29062 29063 29073 29095 29098 29102 29120 29129 29178 29207 29223 29260 29298 29311 29319 29325 29326 29341 29360 29361 29376 29392 29398 29422 29437 29448 29456 29466 29481 29528 29583 29587 29602 29615 29663 29694 29699 29711 29725 29738 29753 29792 29799 29808 29815 29820 29836 29843 29882 29903 29957 29968 30043 30060 30062 30069 30103 30157 30159 30164 30207 30209 30210 30219 30269 30278 30296 30314 30335 30370 30385 30398 30403 30413 30445 30451 30457 30469 30514 30559 30570 30585 30616 30624 30625 30629 30642 30661 30674 30678 30688 30694 30699 30708 30712 30714 30724 30728 30745 30758 30777 30782 30785 30823 30828 30831 30898 30931 30948 30978 31019 31033 31037 31044 31062 31063 31074 31095 31102 31109 31119 31124 31179 31194 31202 31221 31236 31248 31254 31274 31277 31280 31308 31314 31361 31373 31391 31422 31433 31528 31537 31541 31569 31578 31592 31595 31606 31622 
31634 31636 31662 31665 31669 31697 31699 31711 31712 31724 31732 31740 31752 31754 31759 31766 31773 31776 31791 31793 31794 31811 31822 31837 31863 31872 31886 31909 31913 31914 31998 31999 32009 32017 32040 32063 32071 32074 32076 32081 32084 32089 32094 32104 32190 32202 32207 32276 32292 32311 32318 32336 32344 32427 32478 32491 32495 32526 32531 32532 32540 32542 32543
## Number of NA values: 1836
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in workclass variable>
##
## Number of Missing values : 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in workclass variable>
##
## Number of ? values : 0
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in fnlwgt variable>
##
## Number of NA values: 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in fnlwgt variable>
##
## Number of Missing values : 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in fnlwgt variable>
##
## Number of ? values : 0
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in education variable>
##
## Number of NA values: 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in education variable>
##
## Number of Missing values : 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in education variable>
##
## Number of ? values : 0
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in education-num variable>
##
## Number of NA values: 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in education-num variable>
##
## Number of Missing values : 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in education-num variable>
##
## Number of ? values : 0
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in marital-status variable>
##
## Number of NA values: 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in marital-status variable>
##
## Number of Missing values : 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in marital-status variable>
##
## Number of ? values : 0
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in occupation variable>
## 28 62 70 78 107 129 150 155 161 188 202 222 227 244 267 298 313 327 347 348 355 398 409 431 432 450 460 472 485 487 500 512 516 518 519 540 577 581 591 592 597 649 657 669 672 687 696 735 789 790 793 807 812 831 835 886 891 904 925 932 970 983 1020 1035 1036 1040 1047 1098 1101 1132 1135 1153 1168 1176 1181 1186 1209 1216 1218 1263 1283 1291 1313 1318 1326 1348 1350 1372 1389 1405 1421 1434 1442 1453 1459 1505 1545 1563 1570 1571 1575 1580 1593 1607 1630 1657 1666 1677 1705 1708 1759 1762 1774 1779 1824 1847 1852 1866 1879 1924 1932 1972 1988 2026 2037 2047 2062 2073 2085 2092 2095 2106 2119 2127 2153 2156 2164 2165 2211 2214 2223 2282 2293 2324 2328 2341 2355 2357 2359 2360 2373 2381 2383 2398 2421 2428 2465 2477 2487 2492 2497 2507 2514 2522 2545 2567 2571 2572 2579 2587 2595 2607 2633 2635 2639 2674 2677 2690 2752 2761 2762 2848 2857 2858 2859 2886 2931 2933 2948 2953 2962 3000 3006 3034 3043 3066 3073 3089 3097 3120 3132 3147 3208 3212 3229 3232 3240 3256 3270 3292 3298 3331 3339 3352 3372 3388 3403 3440 3454 3457 3460 3487 3517 3532 3556 3573 3574 3580 3589 3593 3595 3598 3632 3671 3704 3726 3737 3745 3748 3760 3774 3776 3806 3823 3835 3844 3852 3864 3888 3896 3898 3902 3903 3917 3943 3948 3950 3951 3964 3970 3981 3991 4004 4018 4019 4022 4073 4081 4087 4091 4110 4148 4153 4156 4170 4175 4201 4213 4216 4218 4241 4273 4289 4300 4310 4316 4338 4370 4394 4410 4414 4423 4436 4438 4461 4465 4500 4501 4520 4532 4549 4554 4593 4607 4608 4614 4622 4628 4656 4686 4689 4722 4730 4746 4753 4756 4767 4779 4783 4802 4813 4819 4836 4839 4885 4926 4943 4959 4971 4981 4983 5021 5065 5148 5157 5173 5194 5199 5208 5210 5215 5229 5256 5295 5297 5303 5308 5321 5322 5341 5345 5347 5362 5384 5385 5441 5447 5472 5493 5527 5530 5549 5562 5566 5591 5624 5633 5653 5680 5688 5721 5754 5767 5789 5804 5809 5833 5854 5917 5922 5929 5978 5984 6016 6040 6052 6060 6132 6179 6232 6285 6286 6315 6343 6352 6408 6433 6449 6511 6522 6537 6543 6550 6559 6565 6592 6641 6647 6664 6680 6734 6735 
6754 6767 6799 6835 6861 6863 6878 6897 6915 6936 6949 6994 6996 7012 7028 7050 7076 7101 7103 7107 7137 7150 7165 7168 7174 7194 7292 7302 7323 7341 7353 7438 7458 7464 7473 7511 7555 7560 7561 7577 7580 7585 7612 7664 7684 7725 7741 7747 7751 7764 7774 7785 7788 7816 7827 7840 7863 7873 7877 7901 7906 7943 7964 7972 7978 8000 8003 8008 8023 8043 8054 8058 8070 8086 8089 8097 8099 8101 8135 8148 8169 8190 8194 8223 8242 8298 8323 8365 8388 8430 8447 8448 8473 8500 8533 8544 8566 8608 8637 8644 8674 8693 8695 8750 8758 8765 8770 8783 8789 8796 8806 8823 8848 8854 8909 8921 8941 8950 8955 8964 8986 8992 8997 9029 9031 9107 9139 9141 9142 9148 9149 9156 9171 9179 9198 9212 9215 9246 9294 9325 9341 9343 9352 9354 9359 9368 9410 9453 9478 9485 9491 9501 9532 9537 9550 9558 9578 9583 9617 9627 9651 9704 9709 9713 9779 9788 9861 9873 9886 9908 9927 9928 9939 9988 10014 10016 10017 10036 10057 10065 10095 10099 10103 10111 10118 10127 10140 10144 10162 10223 10233 10254 10329 10343 10362 10412 10426 10438 10441 10461 10476 10486 10487 10540 10547 10571 10582 10674 10680 10682 10684 10685 10701 10705 10710 10716 10719 10746 10747 10785 10806 10821 10829 10838 10846 10857 10882 10932 10933 10956 10960 10996 11002 11028 11040 11047 11057 11060 11086 11088 11100 11159 11160 11165 11193 11199 11218 11229 11235 11262 11287 11295 11317 11329 11335 11341 11347 11356 11392 11414 11422 11431 11461 11475 11485 11501 11517 11527 11533 11545 11551 11574 11579 11581 11592 11615 11622 11659 11689 11692 11714 11732 11733 11735 11769 11771 11774 11794 11852 11865 11939 12008 12009 12021 12030 12069 12094 12099 12131 12154 12162 12176 12199 12214 12215 12219 12254 12300 12327 12335 12352 12374 12378 12406 12412 12429 12439 12452 12492 12493 12544 12554 12590 12610 12628 12652 12668 12785 12795 12850 12854 12859 12909 12919 12920 12937 12982 12992 12997 13026 13027 13043 13044 13066 13070 13075 13096 13111 13154 13162 13178 13182 13183 13204 13235 13291 13303 13322 13348 13366 13371 13385 
13448 13494 13499 13505 13516 13526 13535 13553 13557 13570 13587 13609 13627 13647 13712 13745 13785 13816 13844 13846 13885 13934 13950 14006 14054 14068 14109 14119 14124 14152 14179 14204 14214 14218 14255 14256 14281 14331 14345 14349 14361 14364 14371 14399 14419 14431 14442 14500 14535 14536 14537 14542 14549 14572 14575 14579 14603 14619 14647 14673 14689 14693 14718 14719 14726 14743 14747 14773 14860 14861 14871 14888 14912 14940 14946 14981 14983 15013 15023 15034 15065 15070 15131 15177 15193 15221 15239 15257 15267 15287 15293 15310 15311 15351 15415 15425 15427 15465 15472 15477 15485 15500 15524 15533 15543 15548 15580 15581 15585 15597 15599 15617 15644 15675 15686 15697 15744 15774 15779 15783 15847 15861 15872 15912 15961 16000 16005 16020 16064 16066 16083 16104 16118 16124 16131 16137 16147 16152 16156 16174 16180 16186 16197 16213 16222 16293 16295 16347 16380 16383 16400 16405 16411 16455 16457 16489 16491 16516 16524 16536 16567 16584 16596 16603 16643 16660 16680 16726 16732 16744 16749 16751 16756 16761 16763 16795 16799 16803 16811 16818 16828 16836 16839 16879 16908 16967 16979 16985 17016 17031 17040 17097 17099 17133 17169 17210 17248 17280 17300 17315 17322 17327 17348 17392 17413 17415 17463 17471 17506 17532 17538 17588 17595 17636 17644 17645 17649 17709 17711 17718 17724 17726 17751 17753 17758 17763 17774 17791 17812 17838 17877 17883 17903 17906 18009 18011 18020 18037 18057 18095 18162 18165 18182 18195 18202 18218 18219 18233 18237 18245 18258 18260 18295 18323 18332 18338 18343 18357 18359 18363 18385 18387 18410 18467 18471 18497 18535 18542 18561 18565 18578 18600 18601 18605 18616 18623 18656 18721 18731 18751 18754 18795 18806 18847 18913 18924 18925 18932 18935 18943 18953 18965 18990 18993 19042 19059 19074 19091 19134 19135 19154 19169 19181 19231 19234 19241 19254 19256 19285 19312 19319 19321 19338 19346 19434 19439 19456 19462 19463 19493 19510 19545 19547 19549 19562 19617 19621 19658 19707 19709 19765 19776 19787 
19789 19813 19815 19820 19821 19831 19843 19858 19890 19897 19987 20004 20008 20010 20018 20024 20030 20032 20039 20065 20069 20073 20095 20100 20106 20160 20162 20188 20192 20206 20228 20267 20270 20272 20292 20303 20315 20322 20334 20338 20397 20400 20435 20437 20475 20481 20528 20531 20545 20552 20564 20577 20596 20609 20613 20647 20657 20665 20687 20725 20758 20774 20776 20777 20783 20798 20804 20826 20827 20830 20869 20878 20881 20916 20931 20939 20942 20954 20964 21040 21097 21115 21126 21143 21147 21153 21159 21177 21180 21243 21244 21273 21275 21290 21349 21357 21395 21411 21414 21423 21429 21438 21454 21456 21466 21484 21488 21492 21517 21525 21529 21538 21546 21550 21587 21627 21632 21649 21667 21687 21699 21726 21747 21763 21800 21804 21806 21813 21830 21848 21852 21858 21862 21893 21915 21920 21947 22007 22023 22034 22043 22061 22076 22095 22102 22110 22131 22166 22174 22226 22255 22282 22349 22351 22374 22379 22380 22388 22397 22406 22423 22496 22503 22511 22546 22557 22634 22645 22732 22752 22758 22787 22796 22799 22821 22834 22842 22849 22886 22899 22911 22929 22952 22977 22984 22998 23010 23020 23063 23100 23110 23124 23137 23177 23193 23210 23229 23233 23238 23268 23282 23316 23331 23334 23337 23343 23352 23354 23374 23376 23389 23396 23416 23422 23503 23536 23537 23539 23545 23552 23593 23637 23671 23712 23730 23745 23756 23760 23794 23819 23823 23826 23854 23866 23881 23916 23919 23921 23937 23955 23981 24004 24016 24048 24054 24055 24074 24085 24110 24134 24150 24156 24184 24196 24233 24239 24242 24248 24300 24309 24319 24380 24387 24406 24430 24462 24476 24486 24528 24533 24580 24581 24637 24645 24687 24695 24705 24715 24736 24753 24761 24774 24780 24782 24788 24797 24810 24814 24822 24829 24876 24877 24895 24903 24914 24950 24999 25010 25035 25045 25054 25057 25061 25068 25077 25100 25123 25135 25164 25187 25209 25217 25226 25239 25267 25271 25296 25304 25322 25360 25398 25418 25425 25431 25442 25458 25520 25523 25525 25538 25569 25579 25589 
25637 25670 25686 25704 25727 25749 25756 25773 25803 25807 25814 25820 25827 25836 25847 25854 25878 25903 25929 25930 25931 25977 25981 26013 26041 26052 26082 26094 26120 26128 26138 26144 26146 26164 26226 26245 26260 26288 26296 26297 26313 26332 26349 26364 26374 26376 26411 26417 26422 26451 26490 26504 26513 26514 26523 26550 26567 26587 26599 26600 26682 26688 26689 26777 26792 26839 26867 26925 26929 26959 26983 26987 26993 26999 27010 27019 27023 27039 27052 27086 27087 27101 27108 27141 27145 27180 27202 27226 27266 27268 27318 27322 27332 27351 27383 27396 27427 27453 27467 27476 27499 27508 27514 27519 27523 27550 27564 27570 27579 27580 27591 27596 27614 27643 27658 27666 27677 27699 27703 27708 27724 27731 27771 27775 27809 27845 27854 27899 27916 27936 27945 27972 27994 28010 28021 28038 28044 28113 28139 28144 28145 28146 28149 28165 28176 28195 28207 28211 28242 28272 28277 28294 28304 28321 28345 28366 28383 28385 28395 28426 28477 28479 28504 28544 28549 28587 28604 28630 28687 28689 28716 28774 28784 28803 28812 28850 28856 28860 28886 28892 28905 28919 28923 28934 28944 28954 28958 28962 29027 29041 29062 29063 29073 29095 29098 29102 29120 29129 29178 29207 29223 29260 29298 29311 29319 29325 29326 29341 29360 29361 29376 29392 29398 29422 29437 29448 29456 29466 29481 29528 29583 29587 29602 29615 29663 29694 29699 29711 29725 29738 29753 29792 29799 29808 29815 29820 29836 29843 29882 29903 29957 29968 30043 30060 30062 30069 30103 30157 30159 30164 30207 30209 30210 30219 30269 30278 30296 30314 30335 30370 30385 30398 30403 30413 30445 30451 30457 30469 30514 30559 30570 30585 30616 30624 30625 30629 30642 30661 30674 30678 30688 30694 30699 30708 30712 30714 30724 30728 30745 30758 30777 30782 30785 30823 30828 30831 30898 30931 30948 30978 31019 31033 31037 31044 31062 31063 31074 31095 31102 31109 31119 31124 31179 31194 31202 31221 31236 31248 31254 31274 31277 31280 31308 31314 31361 31373 31391 31422 31433 31528 31537 31541 31569 
31578 31592 31595 31606 31622 31634 31636 31662 31665 31669 31697 31699 31711 31712 31724 31732 31740 31752 31754 31759 31766 31773 31776 31791 31793 31794 31811 31822 31837 31863 31872 31886 31909 31913 31914 31998 31999 32009 32017 32040 32063 32071 32074 32076 32081 32084 32089 32094 32104 32190 32202 32207 32276 32292 32305 32311 32315 32318 32336 32344 32427 32478 32491 32495 32526 32531 32532 32540 32542 32543
## Number of NA values: 1843
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in occupation variable>
##
## Number of Missing values : 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in occupation variable>
##
## Number of ? values : 0
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in relationship variable>
##
## Number of NA values: 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in relationship variable>
##
## Number of Missing values : 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in relationship variable>
##
## Number of ? values : 0
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in race variable>
##
## Number of NA values: 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in race variable>
##
## Number of Missing values : 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in race variable>
##
## Number of ? values : 0
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in sex variable>
##
## Number of NA values: 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in sex variable>
##
## Number of Missing values : 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in sex variable>
##
## Number of ? values : 0
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in capital-gain variable>
##
## Number of NA values: 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in capital-gain variable>
##
## Number of Missing values : 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in capital-gain variable>
##
## Number of ? values : 0
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in capital-loss variable>
##
## Number of NA values: 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in capital-loss variable>
##
## Number of Missing values : 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in capital-loss variable>
##
## Number of ? values : 0
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in hours-per-week variable>
##
## Number of NA values: 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in hours-per-week variable>
##
## Number of Missing values : 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in hours-per-week variable>
##
## Number of ? values : 0
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in native-country variable>
## 15 39 52 62 94 246 250 298 394 454 558 713 726 730 778 781 888 956 1027 1037 1116 1153 1159 1200 1225 1253 1327 1349 1392 1555 1558 1582 1594 1677 1712 1739 1819 1901 1991 2016 2100 2105 2182 2372 2513 2514 2519 2550 2573 2588 2592 2640 2718 2736 2776 2795 2910 2927 3024 3108 3132 3165 3167 3188 3201 3233 3248 3257 3462 3485 3496 3533 3580 3637 3835 3857 3859 4007 4157 4173 4198 4245 4302 4327 4397 4406 4463 4511 4579 4600 4640 4657 4659 4672 4773 4787 4828 5082 5181 5186 5202 5235 5310 5348 5375 5402 5451 5541 5648 5664 5684 5710 5824 5842 5855 5964 6006 6060 6130 6177 6187 6243 6320 6361 6365 6377 6396 6534 6677 6738 6845 7046 7073 7081 7097 7154 7167 7177 7254 7285 7328 7346 7399 7476 7616 7635 7689 7851 7862 7863 7903 7965 7991 8146 8161 8208 8226 8283 8357 8366 8478 8872 8904 8916 9016 9041 9238 9367 9419 9504 9538 9560 9581 9617 9625 9740 9786 9800 9850 9867 9986 10012 10063 10183 10185 10219 10289 10344 10354 10404 10409 10575 10635 10648 10675 10763 10778 10783 11148 11188 11222 11285 11301 11424 11447 11478 11596 11615 11653 11660 11984 11989 12005 12083 12115 12173 12261 12281 12316 12330 12363 12471 12561 12644 12656 12691 12696 12717 12749 12831 12900 12960 12974 12997 13089 13199 13202 13282 13306 13500 13604 13692 13748 13769 13818 13821 13827 13828 13898 13914 13919 13972 14044 14086 14103 14196 14235 14247 14341 14369 14411 14460 14563 14578 14583 14585 14593 14858 15024 15037 15137 15153 15162 15198 15220 15445 15476 15529 15595 15610 15614 15673 15679 15693 15735 15793 15864 15932 15933 15954 15989 16037 16080 16109 16142 16143 16232 16261 16267 16329 16382 16418 16440 16489 16501 16636 16648 16839 16863 16976 17022 17108 17194 17202 17275 17379 17453 17482 17483 17624 17648 17895 18066 18234 18278 18366 18413 18439 18460 18556 18586 18616 18673 18678 18907 18910 18983 19038 19047 19056 19170 19246 19257 19300 19317 19327 19347 19352 19415 19491 19533 19627 19677 19710 19728 19769 19785 19788 19947 19998 20204 20285 20334 20359 20465 20481 
20500 20532 20633 20639 20658 20659 20717 20748 20848 21063 21109 21127 21135 21196 21227 21265 21383 21394 21532 21542 21557 21669 21723 21819 22003 22069 22107 22231 22242 22265 22318 22352 22430 22475 22541 22562 22615 22640 22678 22743 22772 22789 22791 22862 22908 22982 23033 23116 23174 23237 23285 23435 23441 23467 23471 23566 23638 23688 23705 23730 23785 23798 23893 23916 24214 24458 24466 24573 24593 24607 24663 24696 24751 24833 24891 24892 24924 24961 24981 25047 25106 25113 25236 25242 25276 25297 25314 25343 25360 25459 25479 25492 25505 25550 25575 25620 25630 25842 25871 26008 26198 26222 26235 26272 26297 26333 26364 26378 26406 26447 26461 26570 26617 26662 26763 26801 26901 26923 26941 26980 27020 27069 27134 27142 27300 27306 27377 27384 27670 28019 28045 28108 28125 28195 28196 28197 28221 28336 28344 28432 28483 28501 28506 28590 28619 28629 28689 28706 28836 28842 28913 28933 28938 29030 29034 29099 29105 29213 29256 29324 29358 29378 29402 29441 29524 29593 29681 29683 29739 29778 29787 29889 29982 30011 30106 30111 30171 30231 30275 30277 30303 30330 30370 30583 30639 30657 30671 30701 30774 30822 30903 30923 31090 31129 31337 31360 31388 31397 31469 31556 31638 31642 31702 31797 31945 32091 32170 32214 32233 32255 32308 32414 32450 32470 32493 32511 32526
## Number of NA values: 583
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in native-country variable>
##
## Number of Missing values : 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in native-country variable>
##
## Number of ? values : 0
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in income variable>
##
## Number of NA values: 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in income variable>
##
## Number of Missing values : 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in income variable>
##
## Number of ? values : 0
## [1] "======================================"
## [1] "======================================"
#Find missing values and NAs for testing set.
# For each column, report row names (and counts) that are NA, empty-string,
# or the literal " ?" (the raw files encode missing as "?"; read.table already
# converted those to NA via na.strings, so the last two checks should be 0).
for(i in seq_len(ncol(test))){
col_name <- colnames(test)[i]
# Hoist each subset once instead of recomputing it for cat() and length().
na_rows <- rownames(test)[is.na(test[, i])]
cat("<names of NA rows in", col_name, "variable>", "\n")
cat(na_rows, "\n")
cat("Number of NA values: ", length(na_rows), "\n")
print("======================================")
print("======================================")
blank_rows <- rownames(test[which(test[, i] == ""), ])
cat("<names of rows contain missing values in", col_name, "variable>", "\n")
cat(blank_rows, "\n")
cat("Number of Missing values : ", length(blank_rows), "\n")
print("======================================")
print("======================================")
qmark_rows <- rownames(test[which(test[, i] == " ?"), ])
cat("<names of rows contain ? values in", col_name, "variable>", "\n")
cat(qmark_rows, "\n")
cat("Number of ? values : ", length(qmark_rows), "\n")
print("======================================")
print("======================================")
}
## <names of NA rows in age variable>
##
## Number of NA values: 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in age variable>
##
## Number of Missing values : 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in age variable>
##
## Number of ? values : 0
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in workclass variable>
## 5 7 14 23 36 76 90 101 114 133 183 186 194 229 230 246 267 269 275 317 332 351 379 395 398 414 430 435 438 471 506 517 564 605 613 627 638 641 642 648 658 665 694 704 718 729 766 769 782 817 874 881 914 916 927 934 961 982 1001 1003 1006 1010 1019 1030 1044 1049 1064 1121 1128 1131 1143 1157 1168 1170 1178 1198 1206 1242 1252 1259 1260 1286 1307 1339 1363 1368 1378 1396 1418 1428 1439 1466 1481 1523 1525 1536 1561 1594 1596 1607 1608 1613 1626 1627 1642 1666 1682 1701 1734 1747 1751 1775 1779 1781 1788 1792 1802 1814 1829 1833 1835 1838 1864 1867 1894 1940 1945 1956 1983 2024 2043 2056 2089 2093 2118 2123 2161 2164 2202 2229 2256 2282 2324 2334 2365 2411 2416 2417 2439 2448 2493 2495 2499 2508 2511 2532 2537 2540 2548 2557 2580 2585 2594 2613 2635 2643 2651 2652 2656 2667 2722 2761 2775 2776 2779 2798 2805 2809 2849 2903 2921 2956 2966 2996 3005 3019 3025 3038 3068 3075 3084 3091 3103 3131 3144 3201 3207 3217 3220 3229 3233 3238 3260 3289 3298 3314 3408 3414 3422 3432 3480 3493 3541 3570 3639 3670 3672 3691 3699 3726 3745 3747 3758 3817 3854 3860 3868 3871 3883 3921 3964 3978 3983 3995 3998 4004 4006 4018 4044 4056 4059 4068 4082 4109 4118 4137 4149 4156 4177 4217 4222 4228 4231 4234 4239 4242 4260 4266 4271 4275 4282 4298 4305 4322 4331 4351 4373 4441 4456 4468 4484 4487 4520 4576 4586 4598 4611 4625 4641 4642 4647 4662 4663 4669 4709 4729 4745 4746 4749 4754 4760 4777 4778 4784 4789 4803 4822 4824 4841 4844 4847 4859 4862 4871 4886 4899 4928 4935 4936 4947 4960 4985 4990 4996 4999 5024 5047 5053 5067 5079 5088 5109 5110 5135 5146 5147 5165 5175 5182 5228 5246 5257 5290 5330 5349 5361 5373 5374 5380 5392 5401 5426 5435 5471 5474 5477 5487 5516 5519 5531 5533 5537 5574 5584 5588 5624 5636 5647 5650 5682 5683 5702 5715 5727 5733 5736 5747 5784 5817 5825 5839 5886 5889 5897 5901 5937 5953 5958 5973 5991 5992 6051 6065 6083 6096 6100 6131 6153 6187 6220 6222 6232 6233 6251 6266 6321 6326 6335 6366 6385 6386 6399 6433 6434 6492 6500 6509 6526 6612 6614 6615 6624 
6643 6649 6661 6674 6685 6752 6767 6784 6791 6801 6806 6840 6842 6870 6875 6885 6918 6929 6942 6993 7067 7072 7139 7142 7153 7170 7171 7224 7232 7252 7264 7267 7271 7304 7375 7426 7436 7445 7457 7481 7512 7530 7535 7540 7559 7562 7567 7611 7621 7624 7634 7641 7648 7724 7725 7735 7772 7776 7786 7794 7800 7812 7829 7872 7883 7893 7896 7901 7924 7925 7927 7948 7971 8019 8024 8028 8042 8080 8104 8105 8112 8116 8117 8123 8162 8184 8190 8204 8207 8224 8232 8235 8247 8249 8251 8259 8285 8290 8291 8338 8342 8347 8384 8585 8612 8627 8636 8649 8661 8668 8682 8695 8703 8711 8714 8722 8754 8761 8764 8795 8804 8837 8838 8840 8852 8870 8876 8901 8917 8922 8954 8989 9031 9033 9055 9082 9084 9090 9092 9103 9129 9136 9138 9146 9176 9212 9252 9258 9275 9290 9313 9320 9355 9362 9368 9381 9382 9383 9418 9438 9494 9495 9502 9517 9525 9564 9568 9585 9586 9603 9608 9609 9624 9637 9649 9668 9709 9716 9724 9736 9751 9753 9768 9803 9808 9832 9842 9850 9876 9899 9905 10002 10025 10029 10057 10066 10077 10101 10111 10117 10133 10178 10180 10206 10215 10242 10249 10262 10266 10267 10271 10273 10286 10319 10365 10409 10432 10437 10509 10540 10560 10571 10579 10613 10621 10667 10671 10674 10700 10727 10757 10768 10796 10802 10834 10851 10872 10884 10891 10892 10915 10942 10979 11004 11055 11110 11133 11202 11225 11231 11254 11286 11304 11339 11356 11367 11404 11438 11458 11468 11494 11518 11562 11563 11596 11624 11647 11656 11702 11710 11763 11768 11789 11803 11849 11872 11882 11902 11906 11908 11922 11940 11947 11948 11967 11969 11974 11977 11988 12022 12027 12035 12036 12038 12053 12064 12066 12124 12163 12190 12195 12220 12238 12241 12250 12289 12308 12314 12323 12358 12362 12369 12372 12374 12399 12409 12422 12423 12425 12430 12462 12511 12562 12569 12577 12604 12612 12617 12677 12699 12708 12752 12774 12789 12802 12840 12857 12860 12868 12873 12876 12884 12893 12979 12994 13072 13074 13085 13095 13099 13124 13136 13139 13159 13184 13196 13206 13241 13290 13315 13323 13327 13331 13342 13380 
13406 13407 13415 13417 13418 13433 13440 13468 13473 13474 13491 13496 13521 13531 13578 13597 13660 13662 13664 13680 13769 13772 13780 13797 13826 13834 13838 13839 13854 13871 13873 13892 13898 13908 13936 13952 13958 13985 13988 13990 14019 14034 14048 14057 14074 14132 14135 14179 14203 14209 14240 14286 14316 14363 14373 14378 14386 14425 14431 14449 14459 14465 14490 14491 14505 14523 14548 14556 14558 14562 14574 14605 14606 14629 14653 14657 14665 14666 14670 14675 14681 14706 14722 14724 14727 14729 14738 14758 14759 14762 14780 14792 14816 14866 14884 14932 14942 14961 14981 15004 15008 15050 15075 15141 15171 15181 15182 15193 15201 15208 15221 15238 15249 15259 15260 15286 15287 15318 15325 15335 15337 15343 15367 15409 15414 15419 15428 15471 15480 15503 15521 15525 15530 15551 15558 15574 15593 15600 15638 15639 15655 15679 15680 15684 15711 15713 15733 15748 15749 15769 15782 15788 15797 15813 15823 15824 15833 15837 15840 15847 15852 15857 15862 15865 15877 15880 15910 15913 15917 15923 15925 15953 15984 15989 15993 16002 16007 16019 16033 16036 16118 16122 16209 16240 16252 16278
## Number of NA values: 963
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in workclass variable>
##
## Number of Missing values : 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in workclass variable>
##
## Number of ? values : 0
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in fnlwgt variable>
##
## Number of NA values: 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in fnlwgt variable>
##
## Number of Missing values : 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in fnlwgt variable>
##
## Number of ? values : 0
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in education variable>
##
## Number of NA values: 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in education variable>
##
## Number of Missing values : 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in education variable>
##
## Number of ? values : 0
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in education-num variable>
##
## Number of NA values: 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in education-num variable>
##
## Number of Missing values : 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in education-num variable>
##
## Number of ? values : 0
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in marital-status variable>
##
## Number of NA values: 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in marital-status variable>
##
## Number of Missing values : 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in marital-status variable>
##
## Number of ? values : 0
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in occupation variable>
## 5 7 14 23 36 76 90 101 114 133 183 186 194 229 230 246 267 269 275 317 332 351 379 395 398 414 430 435 438 471 506 517 564 605 613 627 638 641 642 648 658 665 694 704 718 729 766 769 782 817 874 881 914 916 927 934 961 982 1001 1003 1006 1010 1019 1030 1044 1049 1064 1121 1128 1131 1143 1157 1168 1170 1178 1198 1206 1242 1252 1259 1260 1286 1307 1339 1363 1368 1378 1396 1418 1428 1439 1466 1481 1523 1525 1536 1561 1594 1596 1607 1608 1613 1626 1627 1642 1666 1682 1701 1734 1747 1751 1775 1779 1781 1788 1792 1802 1814 1829 1833 1835 1838 1864 1867 1894 1940 1945 1956 1983 2024 2043 2056 2089 2093 2118 2123 2161 2164 2202 2229 2256 2282 2324 2334 2365 2411 2416 2417 2439 2448 2493 2495 2499 2508 2511 2532 2537 2540 2548 2557 2580 2585 2594 2613 2635 2643 2651 2652 2656 2667 2722 2761 2775 2776 2779 2798 2805 2809 2849 2903 2921 2956 2966 2996 3005 3019 3025 3038 3068 3075 3084 3091 3103 3131 3144 3201 3207 3217 3220 3229 3233 3238 3260 3289 3298 3314 3408 3414 3422 3432 3480 3493 3541 3570 3639 3670 3672 3691 3699 3726 3745 3747 3758 3817 3854 3860 3868 3871 3883 3921 3964 3978 3983 3995 3998 4004 4006 4018 4044 4056 4059 4068 4082 4109 4118 4137 4149 4156 4177 4217 4222 4228 4231 4234 4239 4242 4260 4266 4271 4275 4282 4298 4305 4322 4331 4351 4373 4441 4456 4468 4484 4487 4520 4576 4586 4598 4611 4625 4641 4642 4647 4662 4663 4669 4709 4729 4745 4746 4749 4754 4760 4777 4778 4784 4789 4803 4822 4824 4841 4844 4847 4859 4862 4871 4886 4899 4928 4935 4936 4947 4960 4985 4990 4996 4999 5024 5047 5053 5067 5079 5088 5109 5110 5135 5146 5147 5165 5175 5182 5228 5246 5257 5290 5330 5349 5361 5373 5374 5380 5392 5401 5426 5435 5471 5474 5477 5487 5516 5519 5531 5533 5537 5574 5584 5588 5624 5636 5647 5650 5682 5683 5702 5715 5727 5733 5736 5747 5784 5817 5825 5839 5886 5889 5897 5901 5937 5953 5958 5973 5991 5992 6051 6065 6083 6096 6100 6131 6153 6187 6220 6222 6232 6233 6251 6266 6321 6326 6335 6366 6385 6386 6399 6433 6434 6492 6500 6509 6526 6612 6614 6615 6624 
6643 6649 6661 6674 6685 6752 6767 6784 6791 6801 6806 6840 6842 6870 6875 6885 6918 6929 6942 6993 7067 7072 7139 7142 7153 7170 7171 7224 7232 7252 7264 7267 7271 7304 7375 7426 7436 7445 7457 7481 7512 7530 7535 7540 7559 7562 7567 7611 7621 7624 7634 7641 7648 7724 7725 7735 7772 7776 7786 7794 7800 7812 7829 7872 7883 7893 7896 7901 7924 7925 7927 7948 7971 8019 8024 8028 8042 8080 8104 8105 8112 8116 8117 8123 8162 8184 8190 8204 8207 8224 8232 8235 8247 8249 8251 8259 8285 8290 8291 8338 8342 8347 8384 8585 8612 8627 8636 8649 8661 8668 8682 8695 8703 8711 8714 8722 8754 8761 8764 8786 8795 8804 8837 8838 8840 8852 8870 8876 8901 8917 8922 8954 8989 9031 9033 9055 9082 9084 9090 9092 9103 9129 9136 9138 9146 9176 9212 9252 9258 9275 9290 9313 9320 9355 9362 9368 9381 9382 9383 9418 9438 9494 9495 9502 9517 9525 9564 9568 9585 9586 9603 9608 9609 9624 9637 9649 9668 9709 9716 9724 9736 9751 9753 9768 9803 9808 9832 9842 9850 9876 9899 9905 10002 10025 10029 10057 10066 10077 10101 10111 10117 10133 10178 10180 10206 10215 10242 10249 10262 10266 10267 10271 10273 10286 10319 10365 10409 10432 10437 10509 10540 10560 10571 10579 10613 10621 10667 10671 10674 10700 10727 10757 10768 10796 10802 10834 10851 10872 10884 10891 10892 10915 10942 10979 11004 11055 11110 11133 11202 11225 11231 11254 11286 11304 11339 11356 11367 11404 11438 11458 11468 11494 11518 11562 11563 11596 11608 11624 11647 11656 11702 11710 11763 11768 11789 11803 11849 11872 11882 11902 11906 11908 11922 11940 11947 11948 11967 11969 11974 11977 11988 12022 12027 12035 12036 12038 12053 12064 12066 12124 12163 12190 12195 12220 12238 12241 12250 12289 12308 12314 12323 12358 12362 12369 12372 12374 12399 12409 12422 12423 12425 12430 12462 12511 12562 12569 12577 12604 12612 12617 12677 12699 12708 12752 12774 12789 12802 12840 12857 12860 12868 12873 12876 12884 12893 12979 12994 13072 13074 13085 13095 13099 13124 13136 13139 13159 13184 13196 13206 13241 13290 13315 13323 13327 13331 
13342 13380 13406 13407 13415 13417 13418 13433 13440 13468 13473 13474 13491 13496 13521 13531 13578 13597 13660 13662 13664 13680 13769 13772 13780 13797 13826 13834 13838 13839 13854 13871 13873 13892 13898 13899 13908 13936 13952 13958 13985 13988 13990 14019 14034 14048 14057 14074 14132 14135 14179 14203 14209 14240 14286 14316 14363 14373 14378 14386 14425 14431 14449 14459 14465 14490 14491 14505 14523 14548 14556 14558 14562 14574 14605 14606 14629 14653 14657 14665 14666 14670 14675 14681 14706 14722 14724 14727 14729 14738 14758 14759 14762 14780 14792 14816 14866 14884 14932 14942 14961 14981 15004 15008 15050 15075 15141 15171 15181 15182 15193 15201 15208 15221 15238 15249 15259 15260 15286 15287 15318 15325 15335 15337 15343 15367 15409 15414 15419 15428 15471 15480 15503 15521 15525 15530 15551 15558 15574 15593 15600 15638 15639 15655 15679 15680 15684 15711 15713 15733 15748 15749 15769 15782 15788 15797 15813 15823 15824 15833 15837 15840 15847 15852 15857 15862 15865 15877 15880 15910 15913 15917 15923 15925 15953 15984 15989 15993 16002 16007 16019 16033 16036 16118 16122 16209 16240 16252 16278
## Number of NA values: 966
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in occupation variable>
##
## Number of Missing values : 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in occupation variable>
##
## Number of ? values : 0
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in relationship variable>
##
## Number of NA values: 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in relationship variable>
##
## Number of Missing values : 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in relationship variable>
##
## Number of ? values : 0
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in race variable>
##
## Number of NA values: 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in race variable>
##
## Number of Missing values : 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in race variable>
##
## Number of ? values : 0
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in sex variable>
##
## Number of NA values: 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in sex variable>
##
## Number of Missing values : 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in sex variable>
##
## Number of ? values : 0
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in capital-gain variable>
##
## Number of NA values: 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in capital-gain variable>
##
## Number of Missing values : 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in capital-gain variable>
##
## Number of ? values : 0
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in capital-loss variable>
##
## Number of NA values: 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in capital-loss variable>
##
## Number of Missing values : 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in capital-loss variable>
##
## Number of ? values : 0
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in hours-per-week variable>
##
## Number of NA values: 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in hours-per-week variable>
##
## Number of Missing values : 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in hours-per-week variable>
##
## Number of ? values : 0
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in native-country variable>
## 20 66 84 189 254 306 330 404 421 472 516 649 666 688 844 1009 1039 1164 1334 1365 1406 1616 1644 1801 1822 1823 1832 1941 2061 2096 2107 2161 2227 2264 2305 2318 2324 2350 2477 2489 2552 2585 2613 2630 2697 2703 2775 2886 3061 3075 3122 3160 3222 3440 3460 3485 3508 3672 3678 3730 3762 3786 3854 3867 4187 4409 4540 4545 4608 4643 4649 4697 4728 4748 4764 4911 4923 5053 5126 5149 5152 5171 5181 5420 5469 5497 5648 5662 5717 5732 5829 5837 5944 5973 6034 6048 6054 6180 6206 6208 6234 6372 6403 6518 6587 6762 6776 6798 6801 6863 6871 6876 7017 7047 7060 7167 7206 7232 7288 7355 7443 7598 7601 7677 7708 7721 7750 7817 8029 8044 8078 8161 8183 8265 8369 8378 8433 8600 8622 8634 8700 8774 8849 8938 8976 9057 9145 9180 9200 9240 9244 9254 9263 9297 9335 9340 9354 9358 9415 9436 9497 9552 9567 9581 9626 9635 9699 9740 9874 9957 9983 10048 10151 10157 10202 10208 10267 10334 10346 10356 10364 10409 10475 10476 10509 10711 10739 10842 11130 11314 11348 11390 11407 11610 11686 11733 11749 11762 11784 11889 11946 12371 12386 12398 12415 12436 12456 12506 12577 12579 12607 12626 12648 12725 12780 12797 12911 12990 13171 13241 13254 13293 13311 13362 13547 13550 13575 13614 13693 13721 13746 13760 13764 13792 13926 13931 13934 13971 13980 14005 14029 14030 14072 14189 14203 14225 14263 14334 14373 14407 14446 14547 14585 14611 14652 14732 15006 15015 15084 15091 15099 15185 15234 15321 15350 15397 15421 15481 15594 15685 15712 16044 16091 16266
## Number of NA values: 274
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in native-country variable>
##
## Number of Missing values : 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in native-country variable>
##
## Number of ? values : 0
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in income variable>
##
## Number of NA values: 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in income variable>
##
## Number of Missing values : 0
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in income variable>
##
## Number of ? values : 0
## [1] "======================================"
## [1] "======================================"
#Get percentage of missing values
# colMeans(is.na(df)) * 100 gives the per-column NA percentage directly.
# Avoids apply(), which coerces the data.frame to a (character) matrix.
colMeans(is.na(train)) * 100
## age workclass fnlwgt education education-num
## 0.000000 5.638647 0.000000 0.000000 0.000000
## marital-status occupation relationship race sex
## 0.000000 5.660146 0.000000 0.000000 0.000000
## capital-gain capital-loss hours-per-week native-country income
## 0.000000 0.000000 0.000000 1.790486 0.000000
colMeans(is.na(test)) * 100
## age workclass fnlwgt education education-num
## 0.000000 5.914870 0.000000 0.000000 0.000000
## marital-status occupation relationship race sex
## 0.000000 5.933296 0.000000 0.000000 0.000000
## capital-gain capital-loss hours-per-week native-country income
## 0.000000 0.000000 0.000000 1.682943 0.000000
#MICE package to see the pattern
# md.pattern() tabulates missing-data patterns: each row is one pattern
# (1 = observed, 0 = missing); the left margin counts rows with that pattern
# and the bottom margin counts missing cells per variable.
md.pattern(train)
## age fnlwgt education education-num marital-status relationship race
## 30162 1 1 1 1 1 1 1
## 7 1 1 1 1 1 1 1
## 556 1 1 1 1 1 1 1
## 1809 1 1 1 1 1 1 1
## 27 1 1 1 1 1 1 1
## 0 0 0 0 0 0 0
## sex capital-gain capital-loss hours-per-week income native-country
## 30162 1 1 1 1 1 1
## 7 1 1 1 1 1 1
## 556 1 1 1 1 1 0
## 1809 1 1 1 1 1 1
## 27 1 1 1 1 1 0
## 0 0 0 0 0 583
## workclass occupation
## 30162 1 1 0
## 7 1 0 1
## 556 1 1 1
## 1809 0 0 2
## 27 0 0 3
## 1836 1843 4262
# Visualize missingness per variable and pattern frequencies (VIM::aggr).
# Bind the result to a descriptive name: `plot` shadows base::plot and
# misleads readers, even though function lookup still finds base::plot.
missing_plot_train <- aggr(train, col = c('blue', 'yellow'),
                           numbers = TRUE, sortVars = TRUE,
                           labels = names(train), cex.axis = .7,
                           gap = 2, ylab = c("Missing data", "Pattern"))
## Warning in plot.aggr(res, ...): not enough horizontal space to display
## frequencies
##
## Variables sorted by number of missings:
## Variable Count
## occupation 0.05660146
## workclass 0.05638647
## native-country 0.01790486
## age 0.00000000
## fnlwgt 0.00000000
## education 0.00000000
## education-num 0.00000000
## marital-status 0.00000000
## relationship 0.00000000
## race 0.00000000
## sex 0.00000000
## capital-gain 0.00000000
## capital-loss 0.00000000
## hours-per-week 0.00000000
## income 0.00000000
# Missing-data pattern table for the test set (mice::md.pattern);
# same 1/0 encoding as for the training set above.
md.pattern(test)
## age fnlwgt education education-num marital-status relationship race
## 15060 1 1 1 1 1 1 1
## 3 1 1 1 1 1 1 1
## 255 1 1 1 1 1 1 1
## 944 1 1 1 1 1 1 1
## 19 1 1 1 1 1 1 1
## 0 0 0 0 0 0 0
## sex capital-gain capital-loss hours-per-week income native-country
## 15060 1 1 1 1 1 1
## 3 1 1 1 1 1 1
## 255 1 1 1 1 1 0
## 944 1 1 1 1 1 1
## 19 1 1 1 1 1 0
## 0 0 0 0 0 274
## workclass occupation
## 15060 1 1 0
## 3 1 0 1
## 255 1 1 1
## 944 0 0 2
## 19 0 0 3
## 963 966 2203
# Same missingness visualization for the test set (VIM::aggr).
# Use a descriptive name instead of `plot`, which shadows base::plot.
missing_plot_test <- aggr(test, col = c('blue', 'yellow'),
                          numbers = TRUE, sortVars = TRUE,
                          labels = names(test), cex.axis = .7,
                          gap = 2, ylab = c("Missing data", "Pattern"))
## Warning in plot.aggr(res, ...): not enough horizontal space to display
## frequencies
##
## Variables sorted by number of missings:
## Variable Count
## occupation 0.05933296
## workclass 0.05914870
## native-country 0.01682943
## age 0.00000000
## fnlwgt 0.00000000
## education 0.00000000
## education-num 0.00000000
## marital-status 0.00000000
## relationship 0.00000000
## race 0.00000000
## sex 0.00000000
## capital-gain 0.00000000
## capital-loss 0.00000000
## hours-per-week 0.00000000
## income 0.00000000
# Hmisc package to impute missing values
# ww <- aregImpute(~ age + workclass + fnlwgt + education + `education-num` + `marital-status` +
# occupation + relationship + race + sex + `capital-gain` + `capital-loss` +
# `hours-per-week` + income,
# data = train, n.impute = 5, group = "income")
#mlr package to impute missing values
# newworkclass <- impute(train[,2], classes = list(factor = imputeMode(), integer = imputeMean()), dummy.classes = c("integer","factor"), dummy.type = "numeric")
#
# newoccupation <- impute(train[,7], classes = list(factor = imputeMode(), integer = imputeMean()), dummy.classes = c("integer","factor"), dummy.type = "numeric")
#
# newcountry <- impute(train[,14], classes = list(factor = imputeMode(), integer = imputeMean()), dummy.classes = c("integer","factor"), dummy.type = "numeric")
#missForest package to impute missing values
# foresting <- missForest(train, maxiter = 5, ntree = 100)
# foresting$OOBerror
# newtrain <- foresting$ximp
# write.csv(newtrain, file = "../data/cleandata/newtrain.csv", col.names = T, row.names = F)
# Load the previously imputed (missForest) training set.
# Use TRUE, not T: T is an ordinary variable and can be reassigned.
newtrain <- read.csv("../data/cleandata/newtrain.csv", header = TRUE)
dim(newtrain)
## [1] 32561 15
# foresting2 <- missForest(test, maxiter = 5, ntree = 100)
# foresting2$OOBerror
# newtest <- foresting2$ximp
# write.csv(newtest, file = "../data/cleandata/newtest.csv", col.names = T, row.names = F)
# Load the previously imputed (missForest) test set; TRUE instead of T.
newtest <- read.csv("../data/cleandata/newtest.csv", header = TRUE)
dim(newtest)
## [1] 16281 15
#Check whether the data is messed up while imputing missing values
#They should never show 0, as we are supposed to see only missing value has been changed...
#Compare NA with new number in new data set should show NA, not 0.
# Spot-check 20 randomly drawn rows (with replacement, matching the original
# per-iteration sampling): imputed data must equal the original wherever the
# original was observed; cells that were NA compare as NA, never FALSE/0.
# Vectorized comparison replaces the old pattern of seeding a zero matrix,
# growing it with rbind() in a loop, then dropping the dummy row; it also
# avoids calling the result `t`, which shadows base::t().
spot_rows_train <- sample.int(nrow(newtrain), 20, replace = TRUE)
spot_check_train <- newtrain[spot_rows_train, ] == train[spot_rows_train, ]
spot_check_train
## age workclass fnlwgt education education.num marital.status
## 4406 1 1 1 1 1 1
## 18590 1 1 1 1 1 1
## 11515 1 1 1 1 1 1
## 2378 1 1 1 1 1 1
## 6735 1 NA 1 1 1 1
## 919 1 1 1 1 1 1
## 7868 1 1 1 1 1 1
## 16347 1 NA 1 1 1 1
## 8588 1 1 1 1 1 1
## 10709 1 1 1 1 1 1
## 16049 1 1 1 1 1 1
## 6272 1 1 1 1 1 1
## 17338 1 1 1 1 1 1
## 27212 1 1 1 1 1 1
## 17508 1 1 1 1 1 1
## 30001 1 1 1 1 1 1
## 14807 1 1 1 1 1 1
## 16931 1 1 1 1 1 1
## 9813 1 1 1 1 1 1
## 29019 1 1 1 1 1 1
## occupation relationship race sex capital.gain capital.loss
## 4406 1 1 1 1 1 1
## 18590 1 1 1 1 1 1
## 11515 1 1 1 1 1 1
## 2378 1 1 1 1 1 1
## 6735 NA 1 1 1 1 1
## 919 1 1 1 1 1 1
## 7868 1 1 1 1 1 1
## 16347 NA 1 1 1 1 1
## 8588 1 1 1 1 1 1
## 10709 1 1 1 1 1 1
## 16049 1 1 1 1 1 1
## 6272 1 1 1 1 1 1
## 17338 1 1 1 1 1 1
## 27212 1 1 1 1 1 1
## 17508 1 1 1 1 1 1
## 30001 1 1 1 1 1 1
## 14807 1 1 1 1 1 1
## 16931 1 1 1 1 1 1
## 9813 1 1 1 1 1 1
## 29019 1 1 1 1 1 1
## hours.per.week native.country income
## 4406 1 NA 1
## 18590 1 1 1
## 11515 1 1 1
## 2378 1 1 1
## 6735 1 1 1
## 919 1 1 1
## 7868 1 1 1
## 16347 1 1 1
## 8588 1 1 1
## 10709 1 1 1
## 16049 1 1 1
## 6272 1 1 1
## 17338 1 1 1
## 27212 1 1 1
## 17508 1 1 1
## 30001 1 1 1
## 14807 1 1 1
## 16931 1 1 1
## 9813 1 1 1
## 29019 1 1 1
# Same imputation sanity check for the test set: 20 random rows (with
# replacement), compared in one vectorized step instead of a dummy-row
# matrix grown by rbind() in a loop.
spot_rows_test <- sample.int(nrow(newtest), 20, replace = TRUE)
spot_check_test <- newtest[spot_rows_test, ] == test[spot_rows_test, ]
spot_check_test
## age workclass fnlwgt education education.num marital.status
## 6297 1 1 1 1 1 1
## 12202 1 1 1 1 1 1
## 11310 1 1 1 1 1 1
## 14712 1 1 1 1 1 1
## 9277 1 1 1 1 1 1
## 3059 1 1 1 1 1 1
## 6364 1 1 1 1 1 1
## 188 1 1 1 1 1 1
## 13893 1 1 1 1 1 1
## 11202 1 NA 1 1 1 1
## 317 1 NA 1 1 1 1
## 11257 1 1 1 1 1 1
## 16083 1 1 1 1 1 1
## 10765 1 1 1 1 1 1
## 10743 1 1 1 1 1 1
## 6194 1 1 1 1 1 1
## 3742 1 1 1 1 1 1
## 592 1 1 1 1 1 1
## 8118 1 1 1 1 1 1
## 15107 1 1 1 1 1 1
## occupation relationship race sex capital.gain capital.loss
## 6297 1 1 1 1 1 1
## 12202 1 1 1 1 1 1
## 11310 1 1 1 1 1 1
## 14712 1 1 1 1 1 1
## 9277 1 1 1 1 1 1
## 3059 1 1 1 1 1 1
## 6364 1 1 1 1 1 1
## 188 1 1 1 1 1 1
## 13893 1 1 1 1 1 1
## 11202 NA 1 1 1 1 1
## 317 NA 1 1 1 1 1
## 11257 1 1 1 1 1 1
## 16083 1 1 1 1 1 1
## 10765 1 1 1 1 1 1
## 10743 1 1 1 1 1 1
## 6194 1 1 1 1 1 1
## 3742 1 1 1 1 1 1
## 592 1 1 1 1 1 1
## 8118 1 1 1 1 1 1
## 15107 1 1 1 1 1 1
## hours.per.week native.country income
## 6297 1 1 1
## 12202 1 1 1
## 11310 1 1 1
## 14712 1 1 1
## 9277 1 1 1
## 3059 1 1 1
## 6364 1 1 1
## 188 1 1 1
## 13893 1 1 1
## 11202 1 1 1
## 317 1 1 1
## 11257 1 1 1
## 16083 1 1 1
## 10765 1 1 1
## 10743 1 1 1
## 6194 1 1 1
## 3742 1 1 1
## 592 1 1 1
## 8118 1 1 1
## 15107 1 1 1
# (vertical-spacing artifact from the rendered document removed; not R code)
#See structure and summaries before removing outliers
# Column types and factor levels of the imputed test set; after missForest
# imputation no NAs are expected here.
str(newtest)
## 'data.frame': 16281 obs. of 15 variables:
## $ age : int 25 38 28 44 18 34 29 63 24 55 ...
## $ workclass : Factor w/ 8 levels "Federal-gov",..: 4 4 2 4 4 4 4 6 4 4 ...
## $ fnlwgt : int 226802 89814 336951 160323 103497 198693 227026 104626 369667 104996 ...
## $ education : Factor w/ 16 levels "10th","11th",..: 2 12 8 16 16 1 12 15 16 6 ...
## $ education.num : int 7 9 12 10 10 6 9 15 10 4 ...
## $ marital.status: Factor w/ 7 levels "Divorced","Married-AF-spouse",..: 5 3 3 3 5 5 5 3 5 3 ...
## $ occupation : Factor w/ 14 levels "Adm-clerical",..: 7 5 11 7 12 8 6 10 8 3 ...
## $ relationship : Factor w/ 6 levels "Husband","Not-in-family",..: 4 1 1 1 4 2 5 1 5 1 ...
## $ race : Factor w/ 5 levels "Amer-Indian-Eskimo",..: 3 5 5 3 5 5 3 5 5 5 ...
## $ sex : Factor w/ 2 levels "Female","Male": 2 2 2 2 1 2 2 2 1 2 ...
## $ capital.gain : int 0 0 0 7688 0 0 0 3103 0 0 ...
## $ capital.loss : int 0 0 0 0 0 0 0 0 0 0 ...
## $ hours.per.week: int 40 50 40 40 30 30 40 32 40 10 ...
## $ native.country: Factor w/ 40 levels "Cambodia","Canada",..: 38 38 38 38 38 38 38 38 38 38 ...
## $ income : Factor w/ 2 levels "<=50K.",">50K.": 1 1 2 2 1 1 1 2 1 1 ...
# Per-column summary statistics of the imputed test set.
summary(newtest)
## age workclass fnlwgt
## Min. :17.00 Private :11963 Min. : 13492
## 1st Qu.:28.00 Self-emp-not-inc: 1433 1st Qu.: 116736
## Median :37.00 Local-gov : 1090 Median : 177831
## Mean :38.77 State-gov : 710 Mean : 189436
## 3rd Qu.:48.00 Self-emp-inc : 594 3rd Qu.: 238384
## Max. :90.00 Federal-gov : 481 Max. :1490400
## (Other) : 10
## education education.num marital.status
## HS-grad :5283 Min. : 1.00 Divorced :2190
## Some-college:3587 1st Qu.: 9.00 Married-AF-spouse : 14
## Bachelors :2670 Median :10.00 Married-civ-spouse :7403
## Masters : 934 Mean :10.07 Married-spouse-absent: 210
## Assoc-voc : 679 3rd Qu.:12.00 Never-married :5434
## 11th : 637 Max. :16.00 Separated : 505
## (Other) :2491 Widowed : 525
## occupation relationship race
## Prof-specialty :2111 Husband :6523 Amer-Indian-Eskimo: 159
## Craft-repair :2040 Not-in-family :4278 Asian-Pac-Islander: 480
## Exec-managerial:2035 Other-relative: 525 Black : 1561
## Adm-clerical :1967 Own-child :2513 Other : 135
## Sales :1921 Unmarried :1679 White :13946
## Other-service :1825 Wife : 763
## (Other) :4382
## sex capital.gain capital.loss hours.per.week
## Female: 5421 Min. : 0 Min. : 0.0 Min. : 1.00
## Male :10860 1st Qu.: 0 1st Qu.: 0.0 1st Qu.:40.00
## Median : 0 Median : 0.0 Median :40.00
## Mean : 1082 Mean : 87.9 Mean :40.39
## 3rd Qu.: 0 3rd Qu.: 0.0 3rd Qu.:45.00
## Max. :99999 Max. :3770.0 Max. :99.00
##
## native.country income
## United-States:14892 <=50K.:12435
## Mexico : 311 >50K. : 3846
## Philippines : 111
## Puerto-Rico : 70
## Germany : 69
## Canada : 61
## (Other) : 767
# Column types and factor levels of the imputed training set.
str(newtrain)
## 'data.frame': 32561 obs. of 15 variables:
## $ age : int 39 50 38 53 28 37 49 52 31 42 ...
## $ workclass : Factor w/ 8 levels "Federal-gov",..: 7 6 4 4 4 4 4 6 4 4 ...
## $ fnlwgt : int 77516 83311 215646 234721 338409 284582 160187 209642 45781 159449 ...
## $ education : Factor w/ 16 levels "10th","11th",..: 10 10 12 2 10 13 7 12 13 10 ...
## $ education.num : int 13 13 9 7 13 14 5 9 14 13 ...
## $ marital.status: Factor w/ 7 levels "Divorced","Married-AF-spouse",..: 5 3 1 3 3 3 4 3 5 3 ...
## $ occupation : Factor w/ 14 levels "Adm-clerical",..: 1 4 6 6 10 4 8 4 10 4 ...
## $ relationship : Factor w/ 6 levels "Husband","Not-in-family",..: 2 1 2 1 6 6 2 1 2 1 ...
## $ race : Factor w/ 5 levels "Amer-Indian-Eskimo",..: 5 5 5 3 3 5 3 5 5 5 ...
## $ sex : Factor w/ 2 levels "Female","Male": 2 2 2 2 1 1 1 2 1 2 ...
## $ capital.gain : int 2174 0 0 0 0 0 0 0 14084 5178 ...
## $ capital.loss : int 0 0 0 0 0 0 0 0 0 0 ...
## $ hours.per.week: int 40 13 40 40 40 40 16 45 50 40 ...
## $ native.country: Factor w/ 41 levels "Cambodia","Canada",..: 39 39 39 39 5 39 23 39 39 39 ...
## $ income : Factor w/ 2 levels "<=50K",">50K": 1 1 1 1 1 1 1 2 2 2 ...
# Per-column summary statistics of the imputed training set.
summary(newtrain)
## age workclass fnlwgt
## Min. :17.00 Private :24068 Min. : 12285
## 1st Qu.:28.00 Self-emp-not-inc: 2776 1st Qu.: 117827
## Median :37.00 Local-gov : 2193 Median : 178356
## Mean :38.58 State-gov : 1352 Mean : 189778
## 3rd Qu.:48.00 Self-emp-inc : 1164 3rd Qu.: 237051
## Max. :90.00 Federal-gov : 985 Max. :1484705
## (Other) : 23
## education education.num marital.status
## HS-grad :10501 Min. : 1.00 Divorced : 4443
## Some-college: 7291 1st Qu.: 9.00 Married-AF-spouse : 23
## Bachelors : 5355 Median :10.00 Married-civ-spouse :14976
## Masters : 1723 Mean :10.08 Married-spouse-absent: 418
## Assoc-voc : 1382 3rd Qu.:12.00 Never-married :10683
## 11th : 1175 Max. :16.00 Separated : 1025
## (Other) : 5134 Widowed : 993
## occupation relationship race
## Prof-specialty :4295 Husband :13193 Amer-Indian-Eskimo: 311
## Craft-repair :4162 Not-in-family : 8305 Asian-Pac-Islander: 1039
## Exec-managerial:4129 Other-relative: 981 Black : 3124
## Adm-clerical :3992 Own-child : 5068 Other : 271
## Sales :3715 Unmarried : 3446 White :27816
## Other-service :3696 Wife : 1568
## (Other) :8572
## sex capital.gain capital.loss hours.per.week
## Female:10771 Min. : 0 Min. : 0.0 Min. : 1.00
## Male :21790 1st Qu.: 0 1st Qu.: 0.0 1st Qu.:40.00
## Median : 0 Median : 0.0 Median :40.00
## Mean : 1078 Mean : 87.3 Mean :40.44
## 3rd Qu.: 0 3rd Qu.: 0.0 3rd Qu.:45.00
## Max. :99999 Max. :4356.0 Max. :99.00
##
## native.country income
## United-States:29675 <=50K:24720
## Mexico : 657 >50K : 7841
## Philippines : 211
## Germany : 137
## Canada : 121
## Puerto-Rico : 114
## (Other) : 1646
# Deal with outliers for the training set.
# Column indices of the continuous variables
# (age, fnlwgt, education.num, capital.gain, capital.loss, hours.per.week).
continuouscol <- c(1, 3, 5, 11, 12, 13)
par(mfrow = c(2, 3))
# Boxplot of each continuous variable.
for (idx in continuouscol) {
  varlab <- colnames(newtrain[idx])
  boxplot(newtrain[, idx], main = paste("boxplot for", varlab), xlab = varlab)
}
# Kernel-density plot of each continuous variable.
for (idx in continuouscol) {
  dens <- density(newtrain[, idx], adjust = 1)
  plot(dens, main = paste("density plot for", colnames(newtrain[idx])))
  polygon(dens, col = "red", border = "blue")
}
# Boxplot-rule outliers and their counts, stored per column index.
outlierstrain <- list()
for (idx in continuouscol) {
  vals <- boxplot.stats(newtrain[, idx])$out
  outlierstrain[[idx]] <- list(vals, length(vals))
}
# Inspect the per-column outlier values and counts
# (entries are NULL for columns with no boxplot-rule outliers)
head(outlierstrain)
## [[1]]
## [[1]][[1]]
## [1] 79 90 80 81 90 88 90 90 80 90 81 82 79 81 80 83 90 90 79 81 90 90 80
## [24] 90 90 79 79 84 90 80 90 81 83 84 81 79 85 82 79 80 90 90 90 84 80 90
## [47] 90 79 84 90 79 90 90 90 82 81 90 84 79 81 82 81 80 90 80 84 82 79 90
## [70] 84 90 83 79 81 80 79 80 79 80 90 90 80 90 90 81 83 82 90 90 81 80 80
## [93] 90 79 80 82 85 80 79 90 81 79 80 79 81 82 88 90 82 88 84 83 79 86 90
## [116] 90 82 83 81 79 90 80 81 79 84 84 79 90 80 81 81 81 90 87 90 80 80 82
## [139] 90 90 85 82 81
##
## [[1]][[2]]
## [1] 143
##
##
## [[2]]
## NULL
##
## [[3]]
## [[3]][[1]]
## [1] 544091 507875 446839 432376 494223 428030 483777 633742
## [9] 523910 635913 538583 477983 425161 860348 423158 481060
## [17] 416103 445382 1033222 426017 543162 433665 462440 556660
## [25] 430828 475028 420537 680390 499233 543028 465507 526968
## [33] 767403 431192 520586 445824 416745 444304 441454 421132
## [41] 795830 419721 509350 467108 444554 449257 441620 563883
## [49] 431745 436006 473040 910398 451940 428350 421871 443040
## [57] 420895 496743 429507 418324 538319 508336 445382 483201
## [65] 452205 672412 473547 421065 505119 460046 549430 441591
## [73] 438696 488720 482082 460835 519627 675421 481987 758700
## [81] 509364 432565 490332 466224 446219 423460 509364 656036
## [89] 443508 566117 436253 454508 427686 548510 545483 503012
## [97] 573583 511361 454941 452405 716416 480861 498785 637222
## [105] 430084 423770 417657 446358 457402 664821 462890 598606
## [113] 457237 465326 503923 572751 580248 519006 617021 437994
## [121] 596776 588905 517995 640383 504725 423863 420917 470663
## [129] 611029 437851 495888 549341 421837 746786 550848 510072
## [137] 449432 430471 416129 511331 446559 452640 456399 469705
## [145] 656036 488720 434710 449354 425627 417136 460835 416338
## [153] 424079 423561 688355 587310 628797 421449 424988 443508
## [161] 632613 499249 445758 416164 473133 450580 506329 445168
## [169] 516337 432376 571853 1184622 913447 476573 632593 595000
## [177] 703067 484475 476391 749105 459465 543922 420282 498325
## [185] 447579 420749 482732 437281 427965 505980 549349 496025
## [193] 562558 642830 435022 443546 523095 436770 436493 704108
## [201] 557082 477106 471452 426001 464536 451996 505980 454614
## [209] 473748 506858 434102 454989 537222 595000 454508 577521
## [217] 424012 431426 604506 564135 427781 469907 503675 444089
## [225] 435835 512103 716066 487486 484298 479765 444743 483596
## [233] 525878 423250 538443 493034 434292 496382 432154 528616
## [241] 515025 433491 421223 428350 446358 455995 659273 435604
## [249] 425092 452924 541737 444822 423024 445940 468706 428584
## [257] 972354 459189 498216 608184 444219 433788 586657 1226583
## [265] 664670 447346 504725 427055 561334 499001 791084 917220
## [273] 430084 508548 511289 416577 512992 431745 427862 637080
## [281] 431861 671292 442612 494638 431307 459007 517000 421446
## [289] 548361 648223 522881 433669 461678 416059 473836 745768
## [297] 523067 508891 486332 418176 417419 464945 454508 476653
## [305] 488706 647882 569761 585203 539563 1038553 567788 732569
## [313] 416165 721161 509629 474136 450924 477697 423711 419658
## [321] 553473 496414 421967 453067 466458 421561 483530 560804
## [329] 447079 528616 485496 425528 502316 467799 469921 444134
## [337] 443179 497300 426431 607848 501172 441700 483822 420973
## [345] 514033 470663 472604 487411 558183 416829 430005 426263
## [353] 439608 456236 420779 541282 518030 459248 548580 526528
## [361] 447739 586657 433375 581071 437727 575442 554986 592930
## [369] 632834 423052 504951 484861 449576 496538 459463 505438
## [377] 479482 467108 467108 849857 426562 558944 420054 691903
## [385] 419691 684015 423605 461678 466498 530099 554317 420054
## [393] 450920 427952 695136 698418 464103 526968 450695 548303
## [401] 529216 526164 506436 439919 734193 737315 544686 468713
## [409] 548361 556652 691830 520775 442429 433669 607799 660870
## [417] 440456 471990 483822 423222 500509 487742 498785 423064
## [425] 532379 426895 493862 424855 469602 432555 424468 428271
## [433] 464502 446140 480717 529104 456110 451744 680390 438711
## [441] 483450 419053 857532 454063 1484705 424034 421837 425447
## [449] 456956 434467 755858 523484 436861 654141 469864 424034
## [457] 458549 930948 664366 420629 456236 515629 606111 463667
## [465] 431637 509364 634226 458558 483261 420749 446358 428405
## [473] 451996 423297 568490 447882 450246 456236 448626 1268339
## [481] 467579 455995 698363 617860 615893 427382 565313 591711
## [489] 520231 461337 419554 460408 454915 448337 536725 472070
## [497] 430175 446771 485117 500002 462294 443508 418020 435638
## [505] 420277 511517 438139 462255 1366120 495061 420351 431245
## [513] 434894 441210 419394 593246 449432 473133 440138 462838
## [521] 423222 529223 456618 651396 451951 431861 517036 436361
## [529] 497788 529216 441637 526734 543042 428299 427744 501144
## [537] 417668 631947 489085 436798 443855 438427 437890 540712
## [545] 549174 460437 806552 604537 487085 436341 473748 484024
## [553] 1455435 445382 659504 416745 439263 556688 750972 424884
## [561] 607848 454915 419895 548256 493363 463194 450695 422149
## [569] 552354 469056 435503 561489 455361 578377 509500 889965
## [577] 462180 506329 428499 507086 419732 659558 440129 609935
## [585] 521400 608184 425804 415913 513660 424478 422960 445728
## [593] 467108 615367 557236 562336 427474 493443 443546 430554
## [601] 434097 520078 460408 454934 474617 485117 456618 660461
## [609] 423222 442035 533147 497253 617898 449354 419722 440607
## [617] 442045 450544 953588 425622 609789 598995 421633 609789
## [625] 424719 482732 469697 452283 663394 417668 530454 494784
## [633] 436107 543477 452452 481096 420054 495982 556902 421412
## [641] 432052 418405 732102 548256 476334 709445 463072 469454
## [649] 423616 456604 609789 570821 438176 416356 421561 636017
## [657] 703107 544792 434463 434114 423222 418961 595088 438996
## [665] 607848 433705 462832 476334 527162 470875 416415 456572
## [673] 422836 566049 602513 509060 448026 491000 488541 520033
## [681] 554206 429346 455379 443742 520759 421837 694812 578701
## [689] 422013 462869 456618 549413 598802 511289 464103 462294
## [697] 427422 440417 439919 424494 806316 459548 541343 438839
## [705] 439592 1033222 424468 599629 571017 416577 425199 738812
## [713] 497280 447066 477209 431513 618191 544268 557853 535978
## [721] 668319 423024 491421 682947 469572 574271 456460 478829
## [729] 816750 597843 442274 595461 553405 506329 704108 481987
## [737] 460408 515712 551962 572751 745817 422933 473171 481175
## [745] 433170 476558 420986 447488 446512 497486 433330 496856
## [753] 1161363 435836 424591 425049 441542 419691 433330 444607
## [761] 459342 452808 427474 447555 422718 673764 424494 418405
## [769] 446654 434467 479621 472789 454843 456062 588484 809585
## [777] 493689 445382 482927 503454 574271 462820 478994 434268
## [785] 501671 594187 439779 509462 435469 548664 422813 498079
## [793] 431515 447488 466502 558490 456661 509048 419146 468713
## [801] 653574 706026 511068 427965 452640 475324 470203 513416
## [809] 421561 417941 535978 422249 442274 721712 615367 472580
## [817] 549174 437825 1097453 423222 461715 471452 426836 442131
## [825] 477867 461929 478380 479611 419146 472807 515797 475322
## [833] 510072 570562 491000 419134 423024 473133 1085515 500720
## [841] 421633 511668 455361 521665 478457 548361 591711 518530
## [849] 594187 417668 452406 499197 434430 509866 504871 695411
## [857] 420986 442359 462966 761006 484669 423616 467611 440647
## [865] 506830 574005 478205 604045 465974 415913 605502 589809
## [873] 426467 487347 588003 509629 431426 429897 709798 561334
## [881] 481987 570002 443546 1125613 454915 440706 532845 498328
## [889] 604380 583755 437909 420691 510072 557349 501172 609789
## [897] 476599 424094 557644 706180 425785 606752 417668 673764
## [905] 460214 475324 547886 554206 430035 456236 419740 462832
## [913] 440129 584790 425804 481987 799281 657397 496526 426431
## [921] 440969 487330 444554 512771 466325 440969 512828 422275
## [929] 531055 437666 472166 653574 417605 502837 444304 436798
## [937] 745768 478346 857532 715938 747719 569930 423217 433989
## [945] 475322 585361 452402 425497 502752 492263 543922 766115
## [953] 461337 421561 456922 584259 493034 538822 542265 430283
## [961] 498349 431245 491862 420895 448337 418702 477505 421467
## [969] 469454 749636 433906 437727 668362 449101 981628 470368
## [977] 746432 451059 499935 473625 566537 456367 455553 693066
## [985] 539864 447346 478315 427686 435842 485710 436163 514716
##
## [[3]][[2]]
## [1] 992
##
##
## [[4]]
## NULL
##
## [[5]]
## [[5]][[1]]
## [1] 4 3 4 4 2 4 3 4 2 1 4 4 3 3 3 4 2 2 2 3 3 2 4 4 4 3 4 4 3 3 4 3 2 1
## [35] 4 4 4 4 2 2 3 3 4 3 4 3 4 4 3 2 4 4 4 4 3 4 4 4 4 4 4 2 4 4 4 4 3 3
## [69] 4 3 4 4 4 4 4 4 4 4 3 4 3 4 4 2 2 3 3 4 3 2 4 4 4 3 3 2 2 4 3 4 1 4
## [103] 1 4 4 4 3 3 4 3 4 4 4 2 4 3 4 3 3 3 1 4 4 4 4 4 1 4 4 4 3 3 4 4 4 4
## [137] 4 3 4 4 3 2 4 4 4 1 3 4 4 4 4 2 2 4 4 4 2 4 4 3 4 4 4 4 2 4 4 4 3 4
## [171] 3 3 3 4 2 4 4 2 4 4 4 3 4 4 4 3 4 3 4 3 4 3 4 2 3 3 4 4 3 3 4 2 4 3
## [205] 2 2 4 4 2 2 4 4 2 2 3 3 3 4 3 4 4 4 4 4 1 4 3 4 4 4 4 3 4 4 4 1 4 4
## [239] 4 4 4 4 4 4 1 3 4 1 4 4 2 4 2 4 4 4 3 3 3 4 4 4 4 3 2 2 4 4 3 4 4 2
## [273] 4 1 4 4 4 4 4 4 4 4 3 1 1 1 4 4 4 2 4 3 3 3 4 2 4 4 4 3 2 4 4 4 2 4
## [307] 1 4 4 4 4 3 2 2 4 4 4 3 3 3 2 2 4 3 4 3 4 4 4 4 3 4 3 4 4 3 4 4 4 3
## [341] 4 4 3 3 4 3 4 2 3 2 4 3 2 3 4 4 4 2 4 4 4 4 3 3 4 4 2 4 3 1 3 2 4 3
## [375] 3 4 3 3 4 4 2 4 3 2 3 4 3 4 4 3 3 2 4 4 4 3 4 3 4 1 4 4 2 2 4 3 1 4
## [409] 3 3 4 3 4 4 4 3 3 3 4 3 1 4 2 2 4 3 3 3 2 4 4 4 3 4 4 2 3 4 4 3 3 4
## [443] 3 4 4 4 4 4 4 4 3 2 4 3 4 4 3 2 4 2 4 4 4 3 4 3 4 4 4 2 4 4 3 3 4 3
## [477] 1 3 2 3 2 4 4 4 3 4 2 2 4 2 2 3 4 2 3 4 3 3 4 4 4 3 2 3 3 3 4 4 4 4
## [511] 2 3 4 3 2 3 3 3 4 3 4 3 4 4 4 3 4 3 2 4 4 3 3 4 3 4 3 4 3 3 3 2 3 3
## [545] 4 4 1 4 3 4 3 2 4 2 4 3 3 4 3 3 4 2 4 4 4 2 4 4 4 4 4 4 4 4 4 3 2 4
## [579] 2 4 4 3 4 4 4 4 4 3 3 4 2 4 4 3 1 3 4 4 1 3 4 4 4 4 3 4 2 4 4 4 4 2
## [613] 4 3 4 4 4 4 3 4 4 3 2 3 4 2 4 4 4 3 4 3 4 4 4 4 3 4 3 3 4 2 2 3 4 4
## [647] 3 4 4 3 4 3 3 4 4 4 4 4 4 3 3 4 3 2 1 4 4 3 4 3 4 3 3 4 3 4 2 2 4 4
## [681] 2 4 3 2 4 3 4 2 4 3 2 4 3 4 2 2 3 2 3 4 4 4 4 4 4 4 4 3 4 4 3 4 2 4
## [715] 4 4 4 4 4 4 2 4 4 4 4 3 4 3 4 3 1 4 4 3 2 4 3 3 4 4 3 3 4 4 4 3 2 4
## [749] 4 2 3 4 4 4 4 4 3 4 4 3 4 1 4 1 4 4 4 2 4 3 4 4 2 4 1 3 3 3 4 1 3 4
## [783] 4 3 2 4 2 4 4 3 4 3 4 4 1 4 2 3 3 3 2 4 3 4 4 4 4 2 1 2 4 3 4 4 4 3
## [817] 4 3 3 1 4 3 3 2 4 3 3 2 4 3 4 3 4 4 4 4 3 4 4 4 4 4 4 3 2 4 2 3 3 3
## [851] 4 4 4 4 3 3 4 4 4 3 3 2 4 4 4 4 1 4 2 4 4 4 4 3 4 4 4 2 4 4 4 4 1 4
## [885] 1 4 4 4 4 4 2 4 1 4 1 4 4 4 4 3 4 1 4 4 4 4 3 4 3 3 3 4 3 3 2 3 4 4
## [919] 4 1 4 2 4 4 4 4 3 4 3 4 4 3 1 4 4 4 3 4 2 4 4 3 4 3 4 4 3 2 4 4 4 1
## [953] 4 4 1 4 4 4 4 4 3 2 3 4 3 3 2 3 3 4 4 4 2 4 4 2 4 3 1 4 4 2 4 1 4 4
## [987] 3 3 3 3 3 4 3 4 3 3 2 4 3 4 4 4 4 4 4 3 4 3 3 4 3 4 3 2 4 4 4 3 4 3
## [1021] 4 3 2 2 4 2 4 4 4 4 2 4 2 3 3 2 3 4 1 4 3 3 3 4 3 4 2 4 4 3 3 4 2 3
## [1055] 3 4 3 4 3 3 4 2 3 4 4 3 4 3 4 4 4 4 4 4 4 3 4 4 4 4 3 3 4 2 3 4 3 3
## [1089] 2 2 2 2 4 4 3 2 4 4 4 3 2 2 3 4 3 2 4 2 4 4 3 4 4 4 3 4 4 4 3 3 4 3
## [1123] 3 3 4 3 3 4 2 3 4 4 2 4 2 2 2 4 3 4 4 3 3 2 2 4 2 4 3 3 2 4 3 2 4 3
## [1157] 3 4 4 4 4 4 4 2 1 4 2 2 4 4 2 4 4 1 2 4 4 4 3 3 3 1 4 2 3 4 1 4 4 2
## [1191] 3 2 4 4 1 4 4 4
##
## [[5]][[2]]
## [1] 1198
##
##
## [[6]]
## NULL
# Row indices of the 15 largest fnlwgt values. order() already returns a
# sort permutation (stable on ties), so wrapping the column in rank() first
# was redundant.
fnlwgttrainout <- tail(order(newtrain[, 3]), 15)
# Their fnlwgt values, extracted in one vectorized subset instead of
# growing a vector element-by-element inside a 1:length() loop.
fnlout <- newtrain[fnlwgttrainout, 3]
#head(order(rank(newtrain[,5])))
# Frequency table of capital.gain (column 11); note the 159 rows at the
# top-coded maximum of 99999.
table(newtrain[, 11])
##
## 0 114 401 594 914 991 1055 1086 1111 1151 1173 1409
## 29849 6 2 34 8 5 25 4 1 8 3 7
## 1424 1455 1471 1506 1639 1797 1831 1848 2009 2036 2050 2062
## 3 1 7 15 1 7 7 6 3 4 5 2
## 2105 2174 2176 2202 2228 2290 2329 2346 2354 2387 2407 2414
## 9 48 23 16 5 5 6 6 11 1 19 8
## 2463 2538 2580 2597 2635 2653 2829 2885 2907 2936 2961 2964
## 11 1 12 20 11 5 31 24 11 3 3 9
## 2977 2993 3103 3137 3273 3325 3411 3418 3432 3456 3464 3471
## 8 2 97 37 6 53 24 5 4 2 23 8
## 3674 3781 3818 3887 3908 3942 4064 4101 4386 4416 4508 4650
## 14 12 7 6 32 14 42 20 70 12 12 41
## 4687 4787 4865 4931 4934 5013 5060 5178 5455 5556 5721 6097
## 3 23 17 1 7 69 1 97 11 5 3 1
## 6360 6418 6497 6514 6723 6767 6849 7298 7430 7443 7688 7896
## 3 9 11 5 2 5 27 246 9 5 284 3
## 7978 8614 9386 9562 10520 10566 10605 11678 13550 14084 14344 15020
## 1 55 22 4 43 6 12 2 27 41 26 5
## 15024 15831 18481 20051 22040 25124 25236 27828 34095 41310 99999
## 347 6 2 37 1 4 11 34 5 2 159
# Indices of the 159 rows carrying the top-coded capital.gain of 99999
# (the 159 largest values; rank() before order() was redundant, since
# order() on the raw column yields the same permutation).
gainout <- tail(order(newtrain[, 11]), 159)
# Remove those outlier rows from the training set.
dim(newtrain)
## [1] 32561 15
newtrain <- newtrain[-gainout, ]
dim(newtrain)
## [1] 32402 15
# Deal with outliers for the testing set.
# Boxplots of each continuous variable in the test data.
for (idx in continuouscol) {
  boxplot(newtest[, idx],
          main = paste("boxplot for", colnames(newtest[idx])),
          xlab = colnames(newtest[idx]))
}
# Kernel-density plots of the same variables.
for (idx in continuouscol) {
  dens <- density(newtest[, idx], adjust = 1)
  plot(dens, main = paste("density plot for", colnames(newtest[idx])))
  polygon(dens, col = "red", border = "blue")
}
# Boxplot-rule outliers and their counts, stored per column index.
outlierstest <- list()
for (idx in continuouscol) {
  vals <- boxplot.stats(newtest[, idx])$out
  outlierstest[[idx]] <- list(vals, length(vals))
}
# Inspect the test-set outlier values and counts per column
head(outlierstest)
## [[1]]
## [[1]][[1]]
## [1] 79 80 90 79 80 81 82 83 81 85 80 90 81 84 81 89 81 83 81 82 80 90 81
## [24] 83 80 90 90 84 80 80 80 81 90 85 90 81 81 80 80 79 81 80 88 87 90 79
## [47] 83 79 80 90 79 79 81 81 90 82 90 87 81 88 80 81 80 81 90 88 89 84 80
## [70] 80 83 79 81
##
## [[1]][[2]]
## [1] 73
##
##
## [[2]]
## NULL
##
## [[3]]
## [[3]][[1]]
## [1] 444554 432824 465326 445382 479296 428420 456736 537222
## [9] 513100 447488 512864 500068 446894 599057 479179 471990
## [17] 457162 455379 542610 479600 448026 437200 652784 573446
## [25] 453233 662460 426589 629900 499971 450770 481987 478373
## [33] 486194 509364 632733 504725 560313 651702 644278 535852
## [41] 445758 452353 475775 455469 522241 427744 473206 427541
## [49] 581128 444725 608881 490871 430151 431245 451019 430336
## [57] 433602 437994 436431 914061 624006 510072 484475 505365
## [65] 593246 714597 816750 491214 446724 552529 454717 425622
## [73] 575172 475322 622192 566066 493732 427437 427320 614113
## [81] 445365 472517 459556 548568 565769 429832 424988 426350
## [89] 789600 424340 447144 864960 497414 471876 723746 427422
## [97] 421837 692831 535869 433624 638116 467936 698039 427812
## [105] 472861 449101 677398 464621 547931 497039 451742 460322
## [113] 666014 474568 452640 765214 445480 761800 460356 1047822
## [121] 436651 544319 617917 450695 429696 443377 522881 437161
## [129] 421010 479296 459189 469005 457070 750972 505365 458609
## [137] 520231 589155 538193 428251 454321 455399 477345 470486
## [145] 437318 588739 449578 486436 588484 449101 528618 806552
## [153] 478354 467936 505168 858091 451327 482082 663291 447554
## [161] 451603 455995 460408 581025 453983 656488 421633 478457
## [169] 422836 557349 421350 498267 442478 421228 655066 426431
## [177] 494371 737315 541755 436198 594521 442656 491000 455995
## [185] 430672 496856 589838 479296 605504 490332 423453 445382
## [193] 558752 448862 429281 772919 884434 495288 488720 444554
## [201] 604045 437940 697806 632271 497788 464484 587310 467759
## [209] 472344 438587 427055 538243 441227 459465 454950 439777
## [217] 1490400 768659 764638 437458 517995 718736 433682 477083
## [225] 442478 547108 474229 498833 882849 453663 443508 498411
## [233] 504423 746660 488459 423883 457357 501671 786418 565313
## [241] 483201 466458 424934 450200 465334 482096 451603 465725
## [249] 502633 473133 477867 435356 478457 653215 437825 576645
## [257] 510643 538099 425502 432480 482211 539019 496743 455379
## [265] 421132 452402 531055 454076 434081 452402 434710 446947
## [273] 472411 594187 685955 442116 435835 430278 548361 606111
## [281] 459192 592029 426263 513977 647591 566066 553588 433325
## [289] 491607 624572 488706 535740 607118 482677 420973 426431
## [297] 580591 449172 438427 557853 446390 487751 469263 478972
## [305] 441949 430930 635913 485944 557805 626493 444134 433580
## [313] 493034 914061 456736 557349 443336 953588 473547 457710
## [321] 471768 558344 421871 430710 481258 590204 679853 421474
## [329] 443809 516701 443546 535762 438321 814850 427812 874728
## [337] 497525 434102 450141 441949 438429 506830 478277 594194
## [345] 445480 452963 498267 538583 602513 589809 421474 507492
## [353] 546118 446647 530099 453686 443377 1117718 427248 461725
## [361] 460259 849067 590941 572285 608441 720428 423311 436361
## [369] 463601 557359 454024 431515 590522 443546 433592 479406
## [377] 430195 421633 428299 484911 478836 513440 744929 534775
## [385] 511231 598995 456592 525848 442359 458168 457453 913447
## [393] 584259 694105 441227 448841 606347 437566 495366 1024535
## [401] 427474 811615 431551 461929 533660 445382 427475 1210504
## [409] 426263 425830 421837 427770 447210 455995 435836 425816
## [417] 490645 513977 553405 497414 742903 431745 553405 504941
## [425] 450141 456665 449376 487770 448026 443858 473449 440934
## [433] 456430 421200 426589 484879 438696 435638 535027 464552
## [441] 443701 438427 513719 439263 425444 454585 428251 618130
## [449] 542762 771836 473133 464552 435266 437161 462964 423605
## [457] 618808 573446 432204 461484 455379 504871 532969 455665
## [465] 425127 449925 427515 607658 422933 430340 440129
##
## [[3]][[2]]
## [1] 471
##
##
## [[4]]
## NULL
##
## [[5]]
## [[5]][[1]]
## [1] 4 4 3 4 4 4 4 4 4 3 2 3 4 4 2 4 4 3 3 2 4 3 3 4 3 3 4 4 4 1 1 4 3 2 4
## [36] 4 2 3 4 4 1 4 1 4 4 4 3 4 4 3 4 3 4 2 4 2 4 4 4 3 4 2 4 4 3 3 1 1 4 3
## [71] 4 2 3 4 3 3 3 4 4 4 4 4 3 3 3 2 2 4 4 4 4 3 3 4 3 3 3 3 1 2 3 3 3 1 4
## [106] 4 4 4 4 4 4 4 2 3 4 4 3 4 4 4 3 3 3 4 4 1 4 4 4 3 4 2 4 2 4 4 4 4 3 3
## [141] 4 4 1 4 3 4 4 4 3 4 4 4 3 3 3 4 2 2 4 2 4 4 4 4 4 4 4 4 4 2 4 4 3 4 1
## [176] 2 3 4 3 2 4 1 4 2 3 3 4 4 4 1 2 2 4 3 4 4 4 4 3 2 4 4 4 4 3 3 3 4 3 4
## [211] 2 4 4 4 3 4 3 2 4 4 3 4 2 2 4 1 2 3 4 2 4 4 4 4 4 2 4 4 4 3 4 3 4 3 4
## [246] 3 4 3 4 3 4 4 4 4 3 3 3 2 3 4 3 4 4 4 3 1 2 2 2 2 3 1 2 3 4 4 4 1 1 2
## [281] 4 4 4 4 2 4 3 4 3 1 3 3 1 3 4 4 4 4 4 4 3 3 3 3 3 3 4 4 4 4 3 4 4 3 2
## [316] 4 4 2 4 4 3 4 3 4 4 4 4 4 2 3 4 4 3 2 4 2 4 4 4 4 2 3 4 4 3 3 4 3 2 3
## [351] 4 2 3 4 4 3 4 4 2 4 4 3 2 4 4 4 2 4 4 4 3 4 3 3 4 2 4 2 3 3 3 4 3 4 3
## [386] 4 1 4 3 4 4 3 4 2 4 2 3 3 4 3 2 1 1 2 3 3 4 3 1 3 3 2 4 3 4 3 3 3 4 3
## [421] 4 4 2 3 3 3 3 1 3 3 2 4 3 4 1 2 3 4 4 4 4 4 4 3 3 2 3 4 4 3 4 2 4 4 4
## [456] 4 4 2 4 2 4 2 4 4 3 4 3 2 4 3 4 4 3 4 4 4 4 4 3 4 4 3 4 3 4 4 3 2 4 2
## [491] 2 4 2 4 3 4 4 3 4 3 4 3 4 1 1 4 3 2 4 4 4 4 3 3 4 4 2 4 4 4 3 4 3 1 4
## [526] 3 3 4 3 4 4 4 4 4 4 4 4 4 3 2 3 4 3 4 4 4 4 4 3 4 4 3 4 3 4 2 2 3 2 3
## [561] 3 3 4 4 4 1 3 3 3 4 4 1 3 4 2 3 3 3 2 3 3 4 4 4 3 4 4 1 4 4 4 4 4 4 4
## [596] 4
##
## [[5]][[2]]
## [1] 596
##
##
## [[6]]
## NULL
# Frequency table of capital.gain in the test set; 85 rows sit at the
# top-coded maximum of 99999
table(newtest[, 11])
##
## 0 114 401 594 914 991 1055 1086 1151 1173 1264 1409
## 14958 2 3 18 2 1 12 4 5 2 2 3
## 1424 1455 1471 1506 1731 1797 1831 1848 2036 2062 2105 2174
## 1 3 2 9 1 3 2 3 1 1 6 26
## 2176 2202 2290 2329 2346 2354 2407 2414 2463 2538 2580 2597
## 8 12 5 1 2 10 6 2 4 4 8 11
## 2635 2653 2829 2885 2907 2936 2961 2964 2977 2993 3103 3137
## 3 6 11 6 7 1 1 5 3 1 55 14
## 3273 3325 3411 3418 3456 3464 3471 3674 3781 3818 3887 3908
## 1 28 10 3 4 10 3 8 4 4 2 10
## 3942 4064 4101 4386 4416 4508 4650 4687 4787 4865 4931 4934
## 4 12 9 38 12 11 22 1 12 8 3 3
## 5013 5060 5178 5455 5556 5721 6097 6418 6497 6514 6612 6723
## 48 1 49 7 1 4 1 7 4 5 1 3
## 6767 6849 7262 7298 7430 7443 7688 7896 7978 8614 9386 9562
## 1 15 1 118 6 2 126 1 1 27 9 1
## 10520 10566 10605 11678 13550 14084 14344 15020 15024 15831 20051 25124
## 21 2 7 2 15 8 8 5 166 2 12 2
## 25236 27828 34095 41310 99999
## 3 24 1 1 85
# Indices of the 85 rows carrying the top-coded capital.gain of 99999 in the
# TEST set (rank() before order() was redundant). The original comment said
# "training sets", but this step removes outliers from the testing set.
gainout <- tail(order(newtest[, 11]), 85)
# Remove those outlier rows from the testing set.
dim(newtest)
## [1] 16281 15
newtest <- newtest[-gainout, ]
dim(newtest)
## [1] 16196 15
# Re-draw diagnostics after outlier removal: training set.
for (idx in continuouscol) {
  boxplot(newtrain[, idx],
          main = paste("boxplot for", colnames(newtrain[idx]), "-outliers removed"),
          xlab = colnames(newtrain[idx]))
}
for (idx in continuouscol) {
  dens <- density(newtrain[, idx], adjust = 1)
  plot(dens, main = paste("density plot for", colnames(newtrain[idx]), "-outliers removed"))
  polygon(dens, col = "red", border = "blue")
}
# Re-draw diagnostics after outlier removal: testing set.
for (idx in continuouscol) {
  boxplot(newtest[, idx],
          main = paste("boxplot for", colnames(newtest[idx]), "-outliers removed"),
          xlab = colnames(newtest[idx]))
}
for (idx in continuouscol) {
  dens <- density(newtest[, idx], adjust = 1)
  plot(dens, main = paste("density plot for", colnames(newtest[idx]), "-outliers removed"))
  polygon(dens, col = "red", border = "blue")
}
\(\\\)
\(\\\)
# Detach plyr so its functions no longer mask dplyr's (summarise, mutate, ...).
# The namespace itself cannot be unloaded because other loaded packages
# import it (see warning below), but detaching removes it from the search path.
detach("package:plyr", unload=TRUE) # plyr and dplyr loaded together conflict
## Warning: 'plyr' namespace cannot be unloaded:
## namespace 'plyr' is imported by 'ggplot2', 'scales', 'reshape2', 'caret', 'pROC' so cannot be unloaded
# Check whether categorical variables can be discretized (i.e. whether rare
# levels could be lumped together).
# Distribution of workclass in the training set
plot(newtrain$workclass)
table(newtrain$workclass)
##
## Federal-gov Local-gov Never-worked Private
## 983 2187 9 23984
## Self-emp-inc Self-emp-not-inc State-gov Without-pay
## 1127 2747 1351 14
# Relative frequency of each workclass level in the training set
newtrain %>% group_by(workclass) %>% summarise (n = n()) %>% mutate(freq = n / sum(n))
## # A tibble: 8 x 3
## workclass n freq
## <fctr> <int> <dbl>
## 1 Federal-gov 983 0.0303376335
## 2 Local-gov 2187 0.0674958336
## 3 Never-worked 9 0.0002777606
## 4 Private 23984 0.7402012221
## 5 Self-emp-inc 1127 0.0347818036
## 6 Self-emp-not-inc 2747 0.0847787174
## 7 State-gov 1351 0.0416949571
## 8 Without-pay 14 0.0004320721
# Distribution of workclass in the test set
plot(newtest$workclass)
table(newtest$workclass)
##
## Federal-gov Local-gov Never-worked Private
## 480 1089 3 11919
## Self-emp-inc Self-emp-not-inc State-gov Without-pay
## 570 1421 707 7
# Relative frequency of each workclass level in the test set
newtest %>% group_by(workclass) %>% summarise (n = n()) %>% mutate(freq = n / sum(n))
## # A tibble: 8 x 3
## workclass n freq
## <fctr> <int> <dbl>
## 1 Federal-gov 480 0.0296369474
## 2 Local-gov 1089 0.0672388244
## 3 Never-worked 3 0.0001852309
## 4 Private 11919 0.7359224500
## 5 Self-emp-inc 570 0.0351938750
## 6 Self-emp-not-inc 1421 0.0877377130
## 7 State-gov 707 0.0436527538
## 8 Without-pay 7 0.0004322055
# Distribution of education in the training set
plot(newtrain$education)
table(newtrain$education)
##
## 10th 11th 12th 1st-4th 5th-6th
## 931 1175 433 168 333
## 7th-8th 9th Assoc-acdm Assoc-voc Bachelors
## 646 513 1066 1381 5314
## Doctorate HS-grad Masters Preschool Prof-school
## 401 10478 1705 51 530
## Some-college
## 7277
# Relative frequency of each education level in the training set
newtrain %>% group_by(education) %>% summarise (n = n()) %>% mutate(freq = n / sum(n))
## # A tibble: 16 x 3
## education n freq
## <fctr> <int> <dbl>
## 1 10th 931 0.028732794
## 2 11th 1175 0.036263194
## 3 12th 433 0.013363373
## 4 1st-4th 168 0.005184865
## 5 5th-6th 333 0.010277143
## 6 7th-8th 646 0.019937041
## 7 9th 513 0.015832356
## 8 Assoc-acdm 1066 0.032899204
## 9 Assoc-voc 1381 0.042620826
## 10 Bachelors 5314 0.164002222
## 11 Doctorate 401 0.012375779
## 12 HS-grad 10478 0.323375100
## 13 Masters 1705 0.052620209
## 14 Preschool 51 0.001573977
## 15 Prof-school 530 0.016357015
## 16 Some-college 7277 0.224584902
# Distribution of education in the test set
plot(newtest$education)
table(newtest$education)
##
## 10th 11th 12th 1st-4th 5th-6th
## 456 637 224 79 175
## 7th-8th 9th Assoc-acdm Assoc-voc Bachelors
## 309 242 534 677 2648
## Doctorate HS-grad Masters Preschool Prof-school
## 170 5272 922 32 236
## Some-college
## 3583
# Relative frequency of each education level in the test set
newtest %>% group_by(education) %>% summarise (n = n()) %>% mutate(freq = n / sum(n))
## # A tibble: 16 x 3
## education n freq
## <fctr> <int> <dbl>
## 1 10th 456 0.028155100
## 2 11th 637 0.039330699
## 3 12th 224 0.013830575
## 4 1st-4th 79 0.004877748
## 5 5th-6th 175 0.010805137
## 6 7th-8th 309 0.019078785
## 7 9th 242 0.014941961
## 8 Assoc-acdm 534 0.032971104
## 9 Assoc-voc 677 0.041800445
## 10 Bachelors 2648 0.163497160
## 11 Doctorate 170 0.010496419
## 12 HS-grad 5272 0.325512472
## 13 Masters 922 0.056927636
## 14 Preschool 32 0.001975796
## 15 Prof-school 236 0.014571499
## 16 Some-college 3583 0.221227464
# Distribution of marital.status in the training set
plot(newtrain$marital.status)
table(newtrain$marital.status)
##
## Divorced Married-AF-spouse Married-civ-spouse
## 4432 23 14844
## Married-spouse-absent Never-married Separated
## 417 10671 1023
## Widowed
## 992
# Relative frequency of each marital.status level in the training set
newtrain %>% group_by(marital.status) %>% summarise (n = n()) %>% mutate(freq = n / sum(n))
## # A tibble: 7 x 3
## marital.status n freq
## <fctr> <int> <dbl>
## 1 Divorced 4432 0.1367816801
## 2 Married-AF-spouse 23 0.0007098327
## 3 Married-civ-spouse 14844 0.4581198691
## 4 Married-spouse-absent 417 0.0128695760
## 5 Never-married 10671 0.3293315227
## 6 Separated 1023 0.0315721252
## 7 Widowed 992 0.0306153941
# Distribution of marital.status in the test set
plot(newtest$marital.status)
table(newtest$marital.status)
##
## Divorced Married-AF-spouse Married-civ-spouse
## 2181 13 7340
## Married-spouse-absent Never-married Separated
## 210 5425 503
## Widowed
## 524
# Relative frequency of each marital.status level in the test set
newtest %>% group_by(marital.status) %>% summarise (n = n()) %>% mutate(freq = n / sum(n))
## # A tibble: 7 x 3
## marital.status n freq
## <fctr> <int> <dbl>
## 1 Divorced 2181 0.1346628797
## 2 Married-AF-spouse 13 0.0008026673
## 3 Married-civ-spouse 7340 0.4531983206
## 4 Married-spouse-absent 210 0.0129661645
## 5 Never-married 5425 0.3349592492
## 6 Separated 503 0.0310570511
## 7 Widowed 524 0.0323536676
# Distribution of occupation in the training set
plot(newtrain$occupation)
table(newtrain$occupation)
##
## Adm-clerical Armed-Forces Craft-repair Exec-managerial
## 3986 9 4154 4085
## Farming-fishing Handlers-cleaners Machine-op-inspct Other-service
## 1185 1617 2184 3694
## Priv-house-serv Prof-specialty Protective-serv Sales
## 206 4228 734 3690
## Tech-support Transport-moving
## 992 1638
# Relative frequency of each occupation level in the training set
newtrain %>% group_by(occupation) %>% summarise (n = n()) %>% mutate(freq = n / sum(n))
## # A tibble: 14 x 3
## occupation n freq
## <fctr> <int> <dbl>
## 1 Adm-clerical 3986 0.1230170977
## 2 Armed-Forces 9 0.0002777606
## 3 Craft-repair 4154 0.1282019628
## 4 Exec-managerial 4085 0.1260724647
## 5 Farming-fishing 1185 0.0365718166
## 6 Handlers-cleaners 1617 0.0499043269
## 7 Machine-op-inspct 2184 0.0674032467
## 8 Other-service 3694 0.1140053083
## 9 Priv-house-serv 206 0.0063576322
## 10 Prof-specialty 4228 0.1304857725
## 11 Protective-serv 734 0.0226529227
## 12 Sales 3690 0.1138818591
## 13 Tech-support 992 0.0306153941
## 14 Transport-moving 1638 0.0505524350
# Distribution of occupation in the test set
plot(newtest$occupation)
table(newtest$occupation)
##
## Adm-clerical Armed-Forces Craft-repair Exec-managerial
## 1965 6 2032 2009
## Farming-fishing Handlers-cleaners Machine-op-inspct Other-service
## 576 864 1085 1824
## Priv-house-serv Prof-specialty Protective-serv Sales
## 133 2077 367 1912
## Tech-support Transport-moving
## 548 798
# Relative frequency of each occupation level in the test set
newtest %>% group_by(occupation) %>% summarise (n = n()) %>% mutate(freq = n / sum(n))
## # A tibble: 14 x 3
## occupation n freq
## <fctr> <int> <dbl>
## 1 Adm-clerical 1965 0.1213262534
## 2 Armed-Forces 6 0.0003704618
## 3 Craft-repair 2032 0.1254630773
## 4 Exec-managerial 2009 0.1240429736
## 5 Farming-fishing 576 0.0355643369
## 6 Handlers-cleaners 864 0.0533465053
## 7 Machine-op-inspct 1085 0.0669918498
## 8 Other-service 1824 0.1126204001
## 9 Priv-house-serv 133 0.0082119042
## 10 Prof-specialty 2077 0.1282415411
## 11 Protective-serv 367 0.0226599160
## 12 Sales 1912 0.1180538405
## 13 Tech-support 548 0.0338355149
## 14 Transport-moving 798 0.0492714250
# Distribution of relationship in the training set
plot(newtrain$relationship)
table(newtrain$relationship)
##
## Husband Not-in-family Other-relative Own-child Unmarried
## 13072 8284 981 5066 3442
## Wife
## 1557
# Relative frequency of each relationship level in the training set
newtrain %>% group_by(relationship) %>% summarise (n = n()) %>% mutate(freq = n / sum(n))
## # A tibble: 6 x 3
## relationship n freq
## <fctr> <int> <dbl>
## 1 Husband 13072 0.40343189
## 2 Not-in-family 8284 0.25566323
## 3 Other-relative 981 0.03027591
## 4 Own-child 5066 0.15634837
## 5 Unmarried 3442 0.10622801
## 6 Wife 1557 0.04805259
# Distribution of relationship in the test set
plot(newtest$relationship)
table(newtest$relationship)
##
## Husband Not-in-family Other-relative Own-child Unmarried
## 6465 4262 525 2511 1676
## Wife
## 757
# Relative frequency of each relationship level in the test set
newtest %>% group_by(relationship) %>% summarise (n = n()) %>% mutate(freq = n / sum(n))
## # A tibble: 6 x 3
## relationship n freq
## <fctr> <int> <dbl>
## 1 Husband 6465 0.39917264
## 2 Not-in-family 4262 0.26315140
## 3 Other-relative 525 0.03241541
## 4 Own-child 2511 0.15503828
## 5 Unmarried 1676 0.10348234
## 6 Wife 757 0.04673994
# Distribution of race in the training set
plot(newtrain$race)
table(newtrain$race)
##
## Amer-Indian-Eskimo Asian-Pac-Islander Black
## 311 1029 3117
## Other White
## 269 27676
# Relative frequency of each race level in the training set
newtrain %>% group_by(race) %>% summarise (n = n()) %>% mutate(freq = n / sum(n))
## # A tibble: 5 x 3
## race n freq
## <fctr> <int> <dbl>
## 1 Amer-Indian-Eskimo 311 0.009598173
## 2 Asian-Pac-Islander 1029 0.031757299
## 3 Black 3117 0.096197766
## 4 Other 269 0.008301957
## 5 White 27676 0.854144806
# Distribution of race in the test set
plot(newtest$race)
table(newtest$race)
##
## Amer-Indian-Eskimo Asian-Pac-Islander Black
## 159 475 1558
## Other White
## 134 13870
# Relative frequency of each race level in the test set
newtest %>% group_by(race) %>% summarise (n = n()) %>% mutate(freq = n / sum(n))
## # A tibble: 5 x 3
## race n freq
## <fctr> <int> <dbl>
## 1 Amer-Indian-Eskimo 159 0.009817239
## 2 Asian-Pac-Islander 475 0.029328229
## 3 Black 1558 0.096196592
## 4 Other 134 0.008273648
## 5 White 13870 0.856384292
# Distribution of sex in the training set
plot(newtrain$sex)
table(newtrain$sex)
##
## Female Male
## 10749 21653
# Relative frequency of each sex level in the training set
newtrain %>% group_by(sex) %>% summarise (n = n()) %>% mutate(freq = n / sum(n))
## # A tibble: 2 x 3
## sex n freq
## <fctr> <int> <dbl>
## 1 Female 10749 0.3317388
## 2 Male 21653 0.6682612
# Distribution of sex in the test set
plot(newtest$sex)
table(newtest$sex)
##
## Female Male
## 5407 10789
# Relative frequency of each sex level in the test set
newtest %>% group_by(sex) %>% summarise (n = n()) %>% mutate(freq = n / sum(n))
## # A tibble: 2 x 3
## sex n freq
## <fctr> <int> <dbl>
## 1 Female 5407 0.3338479
## 2 Male 10789 0.6661521
# Distribution of native.country in the training set (heavily dominated
# by United-States)
plot(newtrain$native.country)
table(newtrain$native.country)
##
## Cambodia Canada
## 20 120
## China Columbia
## 79 59
## Cuba Dominican-Republic
## 95 70
## Ecuador El-Salvador
## 28 106
## England France
## 90 29
## Germany Greece
## 137 29
## Guatemala Haiti
## 64 44
## Holand-Netherlands Honduras
## 1 13
## Hong Hungary
## 23 13
## India Iran
## 104 43
## Ireland Italy
## 24 74
## Jamaica Japan
## 81 66
## Laos Mexico
## 22 656
## Nicaragua Outlying-US(Guam-USVI-etc)
## 34 14
## Peru Philippines
## 31 210
## Poland Portugal
## 60 37
## Puerto-Rico Scotland
## 114 12
## South Taiwan
## 89 56
## Thailand Trinadad&Tobago
## 19 19
## United-States Vietnam
## 29528 73
## Yugoslavia
## 16
# Relative frequency of each native.country level in the training set
newtrain %>% group_by(native.country) %>% summarise (n = n()) %>% mutate(freq = n / sum(n))
## # A tibble: 41 x 3
## native.country n freq
## <fctr> <int> <dbl>
## 1 Cambodia 20 0.0006172458
## 2 Canada 120 0.0037034751
## 3 China 79 0.0024381211
## 4 Columbia 59 0.0018208753
## 5 Cuba 95 0.0029319178
## 6 Dominican-Republic 70 0.0021603605
## 7 Ecuador 28 0.0008641442
## 8 El-Salvador 106 0.0032714030
## 9 England 90 0.0027776063
## 10 France 29 0.0008950065
## # ... with 31 more rows
# Distribution of native.country in the test set
plot(newtest$native.country)
table(newtest$native.country)
##
## Cambodia Canada
## 12 61
## China Columbia
## 50 26
## Cuba Dominican-Republic
## 43 34
## Ecuador El-Salvador
## 17 49
## England France
## 38 9
## Germany Greece
## 69 20
## Guatemala Haiti
## 24 31
## Honduras Hong
## 7 10
## Hungary India
## 6 56
## Iran Ireland
## 16 13
## Italy Jamaica
## 32 25
## Japan Laos
## 32 5
## Mexico Nicaragua
## 310 15
## Outlying-US(Guam-USVI-etc) Peru
## 9 15
## Philippines Poland
## 109 27
## Portugal Puerto-Rico
## 30 70
## Scotland South
## 9 37
## Taiwan Thailand
## 17 13
## Trinadad&Tobago United-States
## 8 14813
## Vietnam Yugoslavia
## 22 7
# Relative frequency of each native.country level in the test set
newtest %>% group_by(native.country) %>% summarise (n = n()) %>% mutate(freq = n / sum(n))
## # A tibble: 40 x 3
## native.country n freq
## <fctr> <int> <dbl>
## 1 Cambodia 12 0.0007409237
## 2 Canada 61 0.0037663621
## 3 China 50 0.0030871820
## 4 Columbia 26 0.0016053347
## 5 Cuba 43 0.0026549765
## 6 Dominican-Republic 34 0.0020992838
## 7 Ecuador 17 0.0010496419
## 8 El-Salvador 49 0.0030254384
## 9 England 38 0.0023462583
## 10 France 9 0.0005556928
## # ... with 30 more rows
#Check collinearity issues
# Level frequencies of the education factor; compared with education.num
# below to show the two columns encode the same information.
newtrain %>%
  group_by(education) %>%
  tally() %>%
  mutate(freq = n / sum(n))
## # A tibble: 16 x 3
## education n freq
## <fctr> <int> <dbl>
## 1 10th 931 0.028732794
## 2 11th 1175 0.036263194
## 3 12th 433 0.013363373
## 4 1st-4th 168 0.005184865
## 5 5th-6th 333 0.010277143
## 6 7th-8th 646 0.019937041
## 7 9th 513 0.015832356
## 8 Assoc-acdm 1066 0.032899204
## 9 Assoc-voc 1381 0.042620826
## 10 Bachelors 5314 0.164002222
## 11 Doctorate 401 0.012375779
## 12 HS-grad 10478 0.323375100
## 13 Masters 1705 0.052620209
## 14 Preschool 51 0.001573977
## 15 Prof-school 530 0.016357015
## 16 Some-college 7277 0.224584902
# Level frequencies of education.num — identical counts to `education`
# above, confirming the two variables are a one-to-one recoding.
newtrain %>%
  group_by(education.num) %>%
  tally() %>%
  mutate(freq = n / sum(n))
## # A tibble: 16 x 3
## education.num n freq
## <int> <int> <dbl>
## 1 1 51 0.001573977
## 2 2 168 0.005184865
## 3 3 333 0.010277143
## 4 4 646 0.019937041
## 5 5 513 0.015832356
## 6 6 931 0.028732794
## 7 7 1175 0.036263194
## 8 8 433 0.013363373
## 9 9 10478 0.323375100
## 10 10 7277 0.224584902
## 11 11 1381 0.042620826
## 12 12 1066 0.032899204
## 13 13 5314 0.164002222
## 14 14 1705 0.052620209
## 15 15 530 0.016357015
## 16 16 401 0.012375779
# Drop the `education` factor: the frequency tables above show it is a
# one-to-one recoding of education.num, so keeping both would introduce
# perfect collinearity.  Removing by name instead of by position (-4) so
# the statement stays correct if the column order ever changes.
newtrain$education <- NULL
newtest$education <- NULL
# (removed two stray LaTeX line-break tokens, `\(\\\)`, left over from the
#  R Markdown source; they are not valid R)
#Find correlations of the data - for collinearity issue checks
# Pearson correlations among the numeric test-set columns.
# NOTE(review): assumes columns 1, 3, 4, 10, 12 are age, fnlwgt,
# education.num, capital.gain, hours.per.week (matches the printed output
# below) -- selecting by name would be safer than by position.
cor(newtest[, c(1, 3, 4, 10, 12)])
## age fnlwgt education.num capital.gain
## age 1.00000000 -0.0759176992 0.01555523 0.1080390077
## fnlwgt -0.07591770 1.0000000000 -0.02926279 -0.0007549241
## education.num 0.01555523 -0.0292627902 1.00000000 0.1417220957
## capital.gain 0.10803901 -0.0007549241 0.14172210 1.0000000000
## hours.per.week 0.07425722 -0.0026773627 0.12954445 0.0833160656
## hours.per.week
## age 0.074257217
## fnlwgt -0.002677363
## education.num 0.129544454
## capital.gain 0.083316066
## hours.per.week 1.000000000
# Same correlation check on the cleaned training set; the positional
# selection is assumed to match the test-set columns above -- verify by name.
cor(newtrain[, c(1, 3, 4, 10, 12)])
## age fnlwgt education.num capital.gain
## age 1.00000000 -0.076917052 0.03330048 0.116518227
## fnlwgt -0.07691705 1.000000000 -0.04362125 -0.004506565
## education.num 0.03330048 -0.043621248 1.00000000 0.145735884
## capital.gain 0.11651823 -0.004506565 0.14573588 1.000000000
## hours.per.week 0.06774934 -0.019547738 0.14384089 0.082952143
## hours.per.week
## age 0.06774934
## fnlwgt -0.01954774
## education.num 0.14384089
## capital.gain 0.08295214
## hours.per.week 1.00000000
#remove fnlwght variable.
# fnlwgt is a census sampling weight; the correlation matrices above show it
# is essentially uncorrelated with every other numeric predictor, so it adds
# no predictive value.  Removed by name rather than by position (-3) so the
# statement stays correct if the column order ever changes.
newtrain$fnlwgt <- NULL
newtest$fnlwgt <- NULL
#See structure and summaries after removing outliers
# Column types and first few values of the cleaned test set.
str(newtest)
## 'data.frame': 16196 obs. of 13 variables:
## $ age : int 25 38 28 44 18 34 29 63 24 55 ...
## $ workclass : Factor w/ 8 levels "Federal-gov",..: 4 4 2 4 4 4 4 6 4 4 ...
## $ education.num : int 7 9 12 10 10 6 9 15 10 4 ...
## $ marital.status: Factor w/ 7 levels "Divorced","Married-AF-spouse",..: 5 3 3 3 5 5 5 3 5 3 ...
## $ occupation : Factor w/ 14 levels "Adm-clerical",..: 7 5 11 7 12 8 6 10 8 3 ...
## $ relationship : Factor w/ 6 levels "Husband","Not-in-family",..: 4 1 1 1 4 2 5 1 5 1 ...
## $ race : Factor w/ 5 levels "Amer-Indian-Eskimo",..: 3 5 5 3 5 5 3 5 5 5 ...
## $ sex : Factor w/ 2 levels "Female","Male": 2 2 2 2 1 2 2 2 1 2 ...
## $ capital.gain : int 0 0 0 7688 0 0 0 3103 0 0 ...
## $ capital.loss : int 0 0 0 0 0 0 0 0 0 0 ...
## $ hours.per.week: int 40 50 40 40 30 30 40 32 40 10 ...
## $ native.country: Factor w/ 40 levels "Cambodia","Canada",..: 38 38 38 38 38 38 38 38 38 38 ...
## $ income : Factor w/ 2 levels "<=50K.",">50K.": 1 1 2 2 1 1 1 2 1 1 ...
# Five-number summaries / factor counts for the cleaned test set.
summary(newtest)
## age workclass education.num
## Min. :17.00 Private :11919 Min. : 1.00
## 1st Qu.:28.00 Self-emp-not-inc: 1421 1st Qu.: 9.00
## Median :37.00 Local-gov : 1089 Median :10.00
## Mean :38.72 State-gov : 707 Mean :10.06
## 3rd Qu.:48.00 Self-emp-inc : 570 3rd Qu.:12.00
## Max. :90.00 Federal-gov : 480 Max. :16.00
## (Other) : 10
## marital.status occupation relationship
## Divorced :2181 Prof-specialty :2077 Husband :6465
## Married-AF-spouse : 13 Craft-repair :2032 Not-in-family :4262
## Married-civ-spouse :7340 Exec-managerial:2009 Other-relative: 525
## Married-spouse-absent: 210 Adm-clerical :1965 Own-child :2511
## Never-married :5425 Sales :1912 Unmarried :1676
## Separated : 503 Other-service :1824 Wife : 757
## Widowed : 524 (Other) :4377
## race sex capital.gain
## Amer-Indian-Eskimo: 159 Female: 5407 Min. : 0.0
## Asian-Pac-Islander: 475 Male :10789 1st Qu.: 0.0
## Black : 1558 Median : 0.0
## Other : 134 Mean : 562.8
## White :13870 3rd Qu.: 0.0
## Max. :41310.0
##
## capital.loss hours.per.week native.country income
## Min. : 0.00 Min. : 1.00 United-States:14813 <=50K.:12435
## 1st Qu.: 0.00 1st Qu.:40.00 Mexico : 310 >50K. : 3761
## Median : 0.00 Median :40.00 Philippines : 109
## Mean : 88.36 Mean :40.33 Puerto-Rico : 70
## 3rd Qu.: 0.00 3rd Qu.:45.00 Germany : 69
## Max. :3770.00 Max. :99.00 Canada : 61
## (Other) : 764
# Column types and first few values of the cleaned training set.
str(newtrain)
## 'data.frame': 32402 obs. of 13 variables:
## $ age : int 39 50 38 53 28 37 49 52 31 42 ...
## $ workclass : Factor w/ 8 levels "Federal-gov",..: 7 6 4 4 4 4 4 6 4 4 ...
## $ education.num : int 13 13 9 7 13 14 5 9 14 13 ...
## $ marital.status: Factor w/ 7 levels "Divorced","Married-AF-spouse",..: 5 3 1 3 3 3 4 3 5 3 ...
## $ occupation : Factor w/ 14 levels "Adm-clerical",..: 1 4 6 6 10 4 8 4 10 4 ...
## $ relationship : Factor w/ 6 levels "Husband","Not-in-family",..: 2 1 2 1 6 6 2 1 2 1 ...
## $ race : Factor w/ 5 levels "Amer-Indian-Eskimo",..: 5 5 5 3 3 5 3 5 5 5 ...
## $ sex : Factor w/ 2 levels "Female","Male": 2 2 2 2 1 1 1 2 1 2 ...
## $ capital.gain : int 2174 0 0 0 0 0 0 0 14084 5178 ...
## $ capital.loss : int 0 0 0 0 0 0 0 0 0 0 ...
## $ hours.per.week: int 40 13 40 40 40 40 16 45 50 40 ...
## $ native.country: Factor w/ 41 levels "Cambodia","Canada",..: 39 39 39 39 5 39 23 39 39 39 ...
## $ income : Factor w/ 2 levels "<=50K",">50K": 1 1 1 1 1 1 1 2 2 2 ...
# Five-number summaries / factor counts for the cleaned training set.
summary(newtrain)
## age workclass education.num
## Min. :17.00 Private :23984 Min. : 1.00
## 1st Qu.:28.00 Self-emp-not-inc: 2747 1st Qu.: 9.00
## Median :37.00 Local-gov : 2187 Median :10.00
## Mean :38.54 State-gov : 1351 Mean :10.07
## 3rd Qu.:48.00 Self-emp-inc : 1127 3rd Qu.:12.00
## Max. :90.00 Federal-gov : 983 Max. :16.00
## (Other) : 23
## marital.status occupation
## Divorced : 4432 Prof-specialty :4228
## Married-AF-spouse : 23 Craft-repair :4154
## Married-civ-spouse :14844 Exec-managerial:4085
## Married-spouse-absent: 417 Adm-clerical :3986
## Never-married :10671 Other-service :3694
## Separated : 1023 Sales :3690
## Widowed : 992 (Other) :8565
## relationship race sex
## Husband :13072 Amer-Indian-Eskimo: 311 Female:10749
## Not-in-family : 8284 Asian-Pac-Islander: 1029 Male :21653
## Other-relative: 981 Black : 3117
## Own-child : 5066 Other : 269
## Unmarried : 3442 White :27676
## Wife : 1557
##
## capital.gain capital.loss hours.per.week native.country
## Min. : 0.0 Min. : 0.00 Min. : 1.00 United-States:29528
## 1st Qu.: 0.0 1st Qu.: 0.00 1st Qu.:40.00 Mexico : 656
## Median : 0.0 Median : 0.00 Median :40.00 Philippines : 210
## Mean : 592.2 Mean : 87.73 Mean :40.39 Germany : 137
## 3rd Qu.: 0.0 3rd Qu.: 0.00 3rd Qu.:45.00 Canada : 120
## Max. :41310.0 Max. :4356.00 Max. :99.00 Puerto-Rico : 114
## (Other) : 1637
## income
## <=50K:24720
## >50K : 7682
##
##
##
##
##
#Analyzing/checking before discretizing
# table(newtrain[,14])
# table(newtest[,14])
#
# plot(newtrain$education)
# plot(newtrain$occupation)
# plot(newtrain$native.country)
#
# plot(newtest$education)
# plot(newtest$occupation)
# plot(newtest$native.country)
#Discretize training set
# discretetrainage <- discretize(newtrain$age, method = "interval", categories = 10)
# discretetrainfnlwgt <- discretize(newtrain$fnlwgt, method = "interval", categories = 10)
# discretetrainedunum <- discretize(newtrain$education.num, method = "interval", categories = 10)
# discretetraingain <- discretize(newtrain$capital.gain, method = "interval", categories = 10)
# discretetrainloss <- discretize(newtrain$capital.loss, method = "interval", categories = 10)
# discretetrainhours <- discretize(newtrain$hours.per.week, method = "interval", categories = 10)
#Binning
# Collapse native.country into the three most frequent countries plus a
# catch-all "other_countries" bucket, so the factor has few enough levels
# to dummy-code sensibly.
#
# Args:
#   vector: character vector of country names (callers pass
#     as.character(<factor>)).  NA entries fall into "other_countries";
#     the original element-wise loop errored on NA and on length-0 input
#     (1:0 iterates over c(1, 0)).
# Returns:
#   character vector of the same length where every value other than
#   "United-States", "Mexico" or "Philippines" is "other_countries".
countrydis <- function(vector){
  keep <- c("United-States", "Mexico", "Philippines")
  # Vectorized replacement of the per-element if/else chain; %in% never
  # returns NA, so this is also safe on missing values.
  vector[!(vector %in% keep)] <- "other_countries"
  return(vector)
}
# Collapse rare workclass values into a single bucket, keeping the six
# common employment categories as-is.
#
# Args:
#   vector: character vector of workclass labels (callers pass
#     as.character(<factor>)).  NA entries fall into the bucket; the
#     original element-wise loop errored on NA and on length-0 input.
# Returns:
#   character vector of the same length where any value outside the six
#   kept categories is replaced by "No-gain".
#   NOTE(review): "No-gain" looks like an odd label for a workclass bucket
#   (possibly meant "No-pay"?) -- kept byte-for-byte because the dummy
#   column name downstream (`No.gain` in str() output) depends on it.
workdis <- function(vector){
  keep <- c("Federal-gov", "Local-gov", "Private",
            "Self-emp-inc", "Self-emp-not-inc", "State-gov")
  # Vectorized replacement of the per-element if/else chain.
  vector[!(vector %in% keep)] <- "No-gain"
  return(vector)
}
#discretetraincountry <- as.factor(countrydis(as.character(newtrain$native.country)))
#Discretize testing set
# discretetestage <- discretize(newtest$age, method = "interval", categories = 10)
# discretetestfnlwgt <- discretize(newtest$fnlwgt, method = "interval", categories = 10)
# discretetestedunum <- discretize(newtest$education.num, method = "interval", categories = 10)
# discretetestgain <- discretize(newtest$capital.gain, method = "interval", categories = 10)
# discretetestloss <- discretize(newtest$capital.loss, method = "interval", categories = 10)
# discretetesthours <- discretize(newtest$hours.per.week, method = "interval", categories = 10)
# discretetestcountry <- as.factor(countrydis(as.character(newtest$native.country)))
#Combine training and testing to make the same intervals for discretizing
# Tag each row with its origin so the combined frame can be split back
# apart after the recoding below.
newtrain$type <- "train"
newtest$type <- "test"
combined <- rbind(newtrain, newtest)
# discreteage <- discretize(combined$age, method = "interval", categories = 10)
# discretefnlwgt <- discretize(combined$fnlwgt, method = "interval", categories = 10)
# discreteedunum <- discretize(combined$education.num, method = "interval", categories = 10)
# discretegain <- discretize(combined$capital.gain, method = "interval", categories = 7) #not enough data
# discreteloss <- discretize(combined$capital.loss, method = "interval", categories = 7) #not enough data
# discretehours <- discretize(combined$hours.per.week, method = "interval", categories = 10)
# Recode on the combined frame so train and test end up with identical
# factor levels (a level mismatch between the two would break prediction).
discretecountry <- as.factor(countrydis(as.character(combined$native.country)))
discreteworkclass <- as.factor(workdis(as.character(combined$workclass)))
# combined$age <- discreteage
# combined$fnlwgt <- discretefnlwgt
# combined$education.num <- discreteedunum
# combined$capital.gain <- discretegain
# combined$capital.loss <- discreteloss
# combined$hours.per.week <- discretehours
combined$native.country <- discretecountry
combined$workclass <- discreteworkclass
dim(combined)
## [1] 48598 14
# Split back into train/test and drop the helper `type` column.
# NOTE(review): this relies on rbind() keeping all train rows first and on
# `type` being column 14 -- filtering on combined$type == "train" and
# dropping "type" by name would be safer.
newtrain2 <- combined[1:sum(combined$type == "train"), -14]
newtest2 <- combined[(sum(combined$type == "train") + 1):nrow(combined), -14]
dim(newtrain2)
## [1] 32402 13
dim(newtest2)
## [1] 16196 13
# Exploratory plots: each predictor (columns 1-12) against income
# (column 13), drawn four panels per page.
plot_against_income <- function(df) {
  for (col in 1:12) {
    plot(df[, col], df[, 13])
  }
}
par(mfrow = c(2, 2))
plot_against_income(newtrain2)
plot_against_income(newtest2)
#Assignining discretized variables
# newtrain2 <- newtrain
# newtest2 <- newtest
# dim(newtrain2)
# dim(newtest2)
#
# newtrain2$age <- discretetrainage
# newtrain2$fnlwgt <- discretetrainfnlwgt
# newtrain2$education.num <- discretetrainedunum
# newtrain2$capital.gain <- discretetraingain
# newtrain2$capital.loss <- discretetrainloss
# newtrain2$hours.per.week <- discretetrainhours
# newtrain2$native.country <- discretetraincountry
#
# newtest2$age <- discretetestage
# newtest2$fnlwgt <- discretetestfnlwgt
# newtest2$education.num <- discretetestedunum
# newtest2$capital.gain <- discretetestgain
# newtest2$capital.loss <- discretetestloss
# newtest2$hours.per.week <- discretetesthours
# newtest2$native.country <- discretetestcountry
#Dummify training set
# One-hot encode every categorical predictor (one 0/1 column per level).
dumtrainwork <- dummy(newtrain2$workclass)
dumtrainmarry <- dummy(newtrain2$marital.status)
dumtrainoccu <- dummy(newtrain2$occupation)
dumtrainrelation <- dummy(newtrain2$relationship)
dumtrainrace <- dummy(newtrain2$race)
dumtrainsex <- dummy(newtrain2$sex)
dumtraincountry <- dummy(newtrain2$native.country)
#Dummify testing set
dumtestwork <- dummy(newtest2$workclass)
dumtestmarry <- dummy(newtest2$marital.status)
dumtestoccu <- dummy(newtest2$occupation)
dumtestrelation <- dummy(newtest2$relationship)
dumtestrace <- dummy(newtest2$race)
dumtestsex <- dummy(newtest2$sex)
dumtestcountry <- dummy(newtest2$native.country)
#Take out columns
# Drop the original factor columns that were just dummified.
# NOTE(review): positional indices (2, 4, 5, 6, 7, 8, 12) presumably map to
# workclass, marital.status, occupation, relationship, race, sex,
# native.country -- confirm against str(newtrain2); name-based removal
# would be safer.
newtrain2 <- newtrain2[, -c(2, 4, 5, 6, 7, 8, 12)]
newtest2 <- newtest2[, -c(2, 4, 5, 6, 7, 8, 12)]
#Assigning dummified variables
newtrain2 <- cbind(newtrain2, dumtrainwork, dumtrainmarry, dumtrainoccu,
dumtrainrelation, dumtrainrace, dumtrainsex, dumtraincountry)
# Move income to the last column: copy it into new column 45, delete the
# original at position 6, then rename the (now 44th) copy back to "income".
newtrain2[, 45] <- newtrain2$income
newtrain2 <- newtrain2[, -6]
names(newtrain2)[44]<- "income"
dim(newtrain2)
## [1] 32402 44
newtest2 <- cbind(newtest2, dumtestwork, dumtestmarry, dumtestoccu,
dumtestrelation, dumtestrace, dumtestsex, dumtestcountry)
newtest2[, 45] <- newtest2$income
newtest2 <- newtest2[, -6]
names(newtest2)[44]<- "income"
dim(newtest2)
## [1] 16196 44
#fixing...
# NOTE(review): these two droplevels() calls pass label sets that do not
# match either factor's actual levels (train levels have no trailing dot,
# test levels do), so beyond dropping unused levels they are effectively
# no-ops.  The real normalization is the substr() below, which strips the
# trailing "." from the test labels ("<=50K." -> "<=50K") so train and test
# income factors share the same levels.
newtrain2$income <- droplevels(newtrain2$income, c("<=50K.", ">50K."))
newtest2$income <- droplevels(newtest2$income, c("<=50K", ">50K"))
newtest2$income <- as.character(newtest2$income)
newtest2$income <- substr(newtest2$income, 1, nchar(newtest2$income) - 1)
newtest2$income <- as.factor(newtest2$income)
dim(newtrain2)
## [1] 32402 44
dim(newtest2)
## [1] 16196 44
str(newtrain2)
## 'data.frame': 32402 obs. of 44 variables:
## $ age : int 39 50 38 53 28 37 49 52 31 42 ...
## $ education.num : int 13 13 9 7 13 14 5 9 14 13 ...
## $ capital.gain : int 2174 0 0 0 0 0 0 0 14084 5178 ...
## $ capital.loss : int 0 0 0 0 0 0 0 0 0 0 ...
## $ hours.per.week : int 40 13 40 40 40 40 16 45 50 40 ...
## $ Local-gov : num 0 0 0 0 0 0 0 0 0 0 ...
## $ No-gain : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Private : num 0 0 1 1 1 1 1 0 1 1 ...
## $ Self-emp-inc : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Self-emp-not-inc : num 0 1 0 0 0 0 0 1 0 0 ...
## $ State-gov : num 1 0 0 0 0 0 0 0 0 0 ...
## $ Married-AF-spouse : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Married-civ-spouse : num 0 1 0 1 1 1 0 1 0 1 ...
## $ Married-spouse-absent: num 0 0 0 0 0 0 1 0 0 0 ...
## $ Never-married : num 1 0 0 0 0 0 0 0 1 0 ...
## $ Separated : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Widowed : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Armed-Forces : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Craft-repair : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Exec-managerial : num 0 1 0 0 0 1 0 1 0 1 ...
## $ Farming-fishing : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Handlers-cleaners : num 0 0 1 1 0 0 0 0 0 0 ...
## $ Machine-op-inspct : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Other-service : num 0 0 0 0 0 0 1 0 0 0 ...
## $ Priv-house-serv : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Prof-specialty : num 0 0 0 0 1 0 0 0 1 0 ...
## $ Protective-serv : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Sales : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Tech-support : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Transport-moving : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Not-in-family : num 1 0 1 0 0 0 1 0 1 0 ...
## $ Other-relative : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Own-child : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Unmarried : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Wife : num 0 0 0 0 1 1 0 0 0 0 ...
## $ Asian-Pac-Islander : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Black : num 0 0 0 1 1 0 1 0 0 0 ...
## $ Other : num 0 0 0 0 0 0 0 0 0 0 ...
## $ White : num 1 1 1 0 0 1 0 1 1 1 ...
## $ Male : num 1 1 1 1 0 0 0 1 0 1 ...
## $ other_countries : num 0 0 0 0 1 0 1 0 0 0 ...
## $ Philippines : num 0 0 0 0 0 0 0 0 0 0 ...
## $ United-States : num 1 1 1 1 0 1 0 1 1 1 ...
## $ income : Factor w/ 2 levels "<=50K",">50K": 1 1 1 1 1 1 1 2 2 2 ...
str(newtest2)
## 'data.frame': 16196 obs. of 44 variables:
## $ age : int 25 38 28 44 18 34 29 63 24 55 ...
## $ education.num : int 7 9 12 10 10 6 9 15 10 4 ...
## $ capital.gain : int 0 0 0 7688 0 0 0 3103 0 0 ...
## $ capital.loss : int 0 0 0 0 0 0 0 0 0 0 ...
## $ hours.per.week : int 40 50 40 40 30 30 40 32 40 10 ...
## $ Local-gov : num 0 0 1 0 0 0 0 0 0 0 ...
## $ No-gain : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Private : num 1 1 0 1 1 1 1 0 1 1 ...
## $ Self-emp-inc : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Self-emp-not-inc : num 0 0 0 0 0 0 0 1 0 0 ...
## $ State-gov : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Married-AF-spouse : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Married-civ-spouse : num 0 1 1 1 0 0 0 1 0 1 ...
## $ Married-spouse-absent: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Never-married : num 1 0 0 0 1 1 1 0 1 0 ...
## $ Separated : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Widowed : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Armed-Forces : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Craft-repair : num 0 0 0 0 0 0 0 0 0 1 ...
## $ Exec-managerial : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Farming-fishing : num 0 1 0 0 0 0 0 0 0 0 ...
## $ Handlers-cleaners : num 0 0 0 0 0 0 1 0 0 0 ...
## $ Machine-op-inspct : num 1 0 0 1 0 0 0 0 0 0 ...
## $ Other-service : num 0 0 0 0 0 1 0 0 1 0 ...
## $ Priv-house-serv : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Prof-specialty : num 0 0 0 0 0 0 0 1 0 0 ...
## $ Protective-serv : num 0 0 1 0 0 0 0 0 0 0 ...
## $ Sales : num 0 0 0 0 1 0 0 0 0 0 ...
## $ Tech-support : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Transport-moving : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Not-in-family : num 0 0 0 0 0 1 0 0 0 0 ...
## $ Other-relative : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Own-child : num 1 0 0 0 1 0 0 0 0 0 ...
## $ Unmarried : num 0 0 0 0 0 0 1 0 1 0 ...
## $ Wife : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Asian-Pac-Islander : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Black : num 1 0 0 1 0 0 1 0 0 0 ...
## $ Other : num 0 0 0 0 0 0 0 0 0 0 ...
## $ White : num 0 1 1 0 1 1 0 1 1 1 ...
## $ Male : num 1 1 1 1 0 1 1 1 0 1 ...
## $ other_countries : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Philippines : num 0 0 0 0 0 0 0 0 0 0 ...
## $ United-States : num 1 1 1 1 1 1 1 1 1 1 ...
## $ income : Factor w/ 2 levels "<=50K",">50K": 1 1 2 2 1 1 1 2 1 1 ...
## 'data.frame': 32402 obs. of 44 variables:
## $ age : int 39 50 38 53 28 37 49 52 31 42 ...
## $ education.num : int 13 13 9 7 13 14 5 9 14 13 ...
## $ capital.gain : int 2174 0 0 0 0 0 0 0 14084 5178 ...
## $ capital.loss : int 0 0 0 0 0 0 0 0 0 0 ...
## $ hours.per.week : int 40 13 40 40 40 40 16 45 50 40 ...
## $ Local.gov : int 0 0 0 0 0 0 0 0 0 0 ...
## $ No.gain : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Private : int 0 0 1 1 1 1 1 0 1 1 ...
## $ Self.emp.inc : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Self.emp.not.inc : int 0 1 0 0 0 0 0 1 0 0 ...
## $ State.gov : int 1 0 0 0 0 0 0 0 0 0 ...
## $ Married.AF.spouse : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Married.civ.spouse : int 0 1 0 1 1 1 0 1 0 1 ...
## $ Married.spouse.absent: int 0 0 0 0 0 0 1 0 0 0 ...
## $ Never.married : int 1 0 0 0 0 0 0 0 1 0 ...
## $ Separated : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Widowed : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Armed.Forces : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Craft.repair : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Exec.managerial : int 0 1 0 0 0 1 0 1 0 1 ...
## $ Farming.fishing : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Handlers.cleaners : int 0 0 1 1 0 0 0 0 0 0 ...
## $ Machine.op.inspct : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Other.service : int 0 0 0 0 0 0 1 0 0 0 ...
## $ Priv.house.serv : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Prof.specialty : int 0 0 0 0 1 0 0 0 1 0 ...
## $ Protective.serv : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Sales : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Tech.support : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Transport.moving : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Not.in.family : int 1 0 1 0 0 0 1 0 1 0 ...
## $ Other.relative : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Own.child : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Unmarried : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Wife : int 0 0 0 0 1 1 0 0 0 0 ...
## $ Asian.Pac.Islander : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Black : int 0 0 0 1 1 0 1 0 0 0 ...
## $ Other : int 0 0 0 0 0 0 0 0 0 0 ...
## $ White : int 1 1 1 0 0 1 0 1 1 1 ...
## $ Male : int 1 1 1 1 0 0 0 1 0 1 ...
## $ other_countries : int 0 0 0 0 1 0 1 0 0 0 ...
## $ Philippines : int 0 0 0 0 0 0 0 0 0 0 ...
## $ United.States : int 1 1 1 1 0 1 0 1 1 1 ...
## $ income : Factor w/ 2 levels "<=50K",">50K": 1 1 1 1 1 1 1 2 2 2 ...
## 'data.frame': 16196 obs. of 44 variables:
## $ age : int 25 38 28 44 18 34 29 63 24 55 ...
## $ education.num : int 7 9 12 10 10 6 9 15 10 4 ...
## $ capital.gain : int 0 0 0 7688 0 0 0 3103 0 0 ...
## $ capital.loss : int 0 0 0 0 0 0 0 0 0 0 ...
## $ hours.per.week : int 40 50 40 40 30 30 40 32 40 10 ...
## $ Local.gov : int 0 0 1 0 0 0 0 0 0 0 ...
## $ No.gain : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Private : int 1 1 0 1 1 1 1 0 1 1 ...
## $ Self.emp.inc : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Self.emp.not.inc : int 0 0 0 0 0 0 0 1 0 0 ...
## $ State.gov : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Married.AF.spouse : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Married.civ.spouse : int 0 1 1 1 0 0 0 1 0 1 ...
## $ Married.spouse.absent: int 0 0 0 0 0 0 0 0 0 0 ...
## $ Never.married : int 1 0 0 0 1 1 1 0 1 0 ...
## $ Separated : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Widowed : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Armed.Forces : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Craft.repair : int 0 0 0 0 0 0 0 0 0 1 ...
## $ Exec.managerial : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Farming.fishing : int 0 1 0 0 0 0 0 0 0 0 ...
## $ Handlers.cleaners : int 0 0 0 0 0 0 1 0 0 0 ...
## $ Machine.op.inspct : int 1 0 0 1 0 0 0 0 0 0 ...
## $ Other.service : int 0 0 0 0 0 1 0 0 1 0 ...
## $ Priv.house.serv : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Prof.specialty : int 0 0 0 0 0 0 0 1 0 0 ...
## $ Protective.serv : int 0 0 1 0 0 0 0 0 0 0 ...
## $ Sales : int 0 0 0 0 1 0 0 0 0 0 ...
## $ Tech.support : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Transport.moving : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Not.in.family : int 0 0 0 0 0 1 0 0 0 0 ...
## $ Other.relative : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Own.child : int 1 0 0 0 1 0 0 0 0 0 ...
## $ Unmarried : int 0 0 0 0 0 0 1 0 1 0 ...
## $ Wife : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Asian.Pac.Islander : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Black : int 1 0 0 1 0 0 1 0 0 0 ...
## $ Other : int 0 0 0 0 0 0 0 0 0 0 ...
## $ White : int 0 1 1 0 1 1 0 1 1 1 ...
## $ Male : int 1 1 1 1 0 1 1 1 0 1 ...
## $ other_countries : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Philippines : int 0 0 0 0 0 0 0 0 0 0 ...
## $ United.States : int 1 1 1 1 1 1 1 1 1 1 ...
## $ income : Factor w/ 2 levels "<=50K",">50K": 1 1 2 2 1 1 1 2 1 1 ...
set.seed(100)  # reproducible cross-validation folds inside rpart
#Create a baseline Classification tree using gini index criterion using random cp
# minsplit = 5 and a very small cp let the tree grow aggressively while
# maxdepth = 5 still caps its depth.
tree <- rpart(income ~., data = newtrain2, method = "class",
parms = list(split = 'gini'), control = rpart.control(minsplit = 5, cp = 0.0001, maxdepth = 5))
#Visualization of the tree
rpart.plot(tree)
#Pick the optimal tuning parameter
# Select the complexity parameter with the lowest cross-validated error
# from the tree's cp table.
cp <- tree$cptable[which.min(tree$cptable[, "xerror"]), "CP"]
cp #0.0002603489
## [1] 0.0002603489
# this optimal cp is the same as the default cp that we used in rpart function
#Prune the tree using the optimal cp
treepruned <- prune(tree, cp = cp)
#Treepruned object
treepruned
## n= 32402
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 32402 7682 <=50K (0.762915869 0.237084131)
## 2) Married.civ.spouse< 0.5 17558 1122 <=50K (0.936097505 0.063902495)
## 4) capital.gain< 7073.5 17274 849 <=50K (0.950850990 0.049149010) *
## 5) capital.gain>=7073.5 284 11 >50K (0.038732394 0.961267606)
## 10) capital.gain>=30961.5 5 0 <=50K (1.000000000 0.000000000) *
## 11) capital.gain< 30961.5 279 6 >50K (0.021505376 0.978494624) *
## 3) Married.civ.spouse>=0.5 14844 6560 <=50K (0.558070601 0.441929399)
## 6) education.num< 12.5 10475 3446 <=50K (0.671026253 0.328973747)
## 12) capital.gain< 5095.5 9979 2961 <=50K (0.703276881 0.296723119)
## 24) education.num< 8.5 1656 167 <=50K (0.899154589 0.100845411) *
## 25) education.num>=8.5 8323 2794 <=50K (0.664303737 0.335696263)
## 50) capital.loss< 1782.5 7988 2542 <=50K (0.681772659 0.318227341) *
## 51) capital.loss>=1782.5 335 83 >50K (0.247761194 0.752238806) *
## 13) capital.gain>=5095.5 496 11 >50K (0.022177419 0.977822581) *
## 7) education.num>=12.5 4369 1255 >50K (0.287251087 0.712748913)
## 14) capital.gain< 5095.5 3788 1252 >50K (0.330517423 0.669482577)
## 28) capital.loss< 1782.5 3390 1239 >50K (0.365486726 0.634513274)
## 56) hours.per.week< 31 306 112 <=50K (0.633986928 0.366013072) *
## 57) hours.per.week>=31 3084 1045 >50K (0.338845655 0.661154345) *
## 29) capital.loss>=1782.5 398 13 >50K (0.032663317 0.967336683) *
## 15) capital.gain>=5095.5 581 3 >50K (0.005163511 0.994836489) *
#Information by cp cross-validation results
# Complexity table: relative and cross-validated error at each candidate cp.
printcp(treepruned)
##
## Classification tree:
## rpart(formula = income ~ ., data = newtrain2, method = "class",
## parms = list(split = "gini"), control = rpart.control(minsplit = 5,
## cp = 0.0001, maxdepth = 5))
##
## Variables actually used in tree construction:
## [1] capital.gain capital.loss education.num
## [4] hours.per.week Married.civ.spouse
##
## Root node error: 7682/32402 = 0.23708
##
## n= 32402
##
## CP nsplit rel error xerror xstd
## 1 0.12099714 0 1.00000 1.00000 0.0099655
## 2 0.06170268 2 0.75801 0.75801 0.0089967
## 3 0.03410570 3 0.69630 0.69630 0.0086993
## 4 0.01099974 4 0.66220 0.66220 0.0085245
## 5 0.00355810 6 0.64020 0.65217 0.0084717
## 6 0.00065087 9 0.62952 0.64215 0.0084182
## 7 0.00026035 10 0.62887 0.64007 0.0084069
# Cross-validated error as a function of cp / tree size.
plotcp(treepruned)
#summary information
summary(treepruned, digits = 3)
## Call:
## rpart(formula = income ~ ., data = newtrain2, method = "class",
## parms = list(split = "gini"), control = rpart.control(minsplit = 5,
## cp = 0.0001, maxdepth = 5))
## n= 32402
##
## CP nsplit rel error xerror xstd
## 1 0.120997 0 1.000 1.000 0.00997
## 2 0.061703 2 0.758 0.758 0.00900
## 3 0.034106 3 0.696 0.696 0.00870
## 4 0.011000 4 0.662 0.662 0.00852
## 5 0.003558 6 0.640 0.652 0.00847
## 6 0.000651 9 0.630 0.642 0.00842
## 7 0.000260 10 0.629 0.640 0.00841
##
## Variable importance
## Married.civ.spouse Never.married education.num
## 27 14 12
## capital.gain Not.in.family Male
## 12 10 8
## age Own.child Prof.specialty
## 6 4 3
## capital.loss hours.per.week
## 2 1
##
## Node number 1: 32402 observations, complexity param=0.121
## predicted class=<=50K expected loss=0.237 P(node) =1
## class counts: 24720 7682
## probabilities: 0.763 0.237
## left son=2 (17558 obs) right son=3 (14844 obs)
## Primary splits:
## Married.civ.spouse < 0.5 to the left, improve=2300, (0 missing)
## capital.gain < 5120 to the left, improve=1480, (0 missing)
## education.num < 12.5 to the left, improve=1210, (0 missing)
## Never.married < 0.5 to the right, improve=1180, (0 missing)
## age < 29.5 to the left, improve= 951, (0 missing)
## Surrogate splits:
## Never.married < 0.5 to the right, agree=0.787, adj=0.536, (0 split)
## Not.in.family < 0.5 to the right, agree=0.713, adj=0.373, (0 split)
## Male < 0.5 to the left, agree=0.688, adj=0.320, (0 split)
## age < 33.5 to the left, agree=0.648, adj=0.231, (0 split)
## Own.child < 0.5 to the right, agree=0.609, adj=0.146, (0 split)
##
## Node number 2: 17558 observations, complexity param=0.0341
## predicted class=<=50K expected loss=0.0639 P(node) =0.542
## class counts: 16436 1122
## probabilities: 0.936 0.064
## left son=4 (17274 obs) right son=5 (284 obs)
## Primary splits:
## capital.gain < 7070 to the left, improve=465.0, (0 missing)
## education.num < 12.5 to the left, improve=142.0, (0 missing)
## hours.per.week < 43.5 to the left, improve=106.0, (0 missing)
## age < 28.5 to the left, improve= 68.8, (0 missing)
## capital.loss < 2370 to the left, improve= 59.4, (0 missing)
##
## Node number 3: 14844 observations, complexity param=0.121
## predicted class=<=50K expected loss=0.442 P(node) =0.458
## class counts: 8284 6560
## probabilities: 0.558 0.442
## left son=6 (10475 obs) right son=7 (4369 obs)
## Primary splits:
## education.num < 12.5 to the left, improve=908, (0 missing)
## capital.gain < 5100 to the left, improve=690, (0 missing)
## Exec.managerial < 0.5 to the left, improve=331, (0 missing)
## Prof.specialty < 0.5 to the left, improve=319, (0 missing)
## capital.loss < 1780 to the left, improve=268, (0 missing)
## Surrogate splits:
## Prof.specialty < 0.5 to the left, agree=0.791, adj=0.289, (0 split)
## capital.gain < 7490 to the left, agree=0.717, adj=0.040, (0 split)
## Exec.managerial < 0.5 to the left, agree=0.712, adj=0.021, (0 split)
## capital.loss < 1890 to the left, agree=0.711, adj=0.018, (0 split)
## State.gov < 0.5 to the left, agree=0.707, adj=0.004, (0 split)
##
## Node number 4: 17274 observations
## predicted class=<=50K expected loss=0.0491 P(node) =0.533
## class counts: 16425 849
## probabilities: 0.951 0.049
##
## Node number 5: 284 observations, complexity param=0.000651
## predicted class=>50K expected loss=0.0387 P(node) =0.00876
## class counts: 11 273
## probabilities: 0.039 0.961
## left son=10 (5 obs) right son=11 (279 obs)
## Primary splits:
## capital.gain < 31000 to the right, improve=9.41, (0 missing)
## age < 21 to the left, improve=7.50, (0 missing)
## Handlers.cleaners < 0.5 to the right, improve=3.72, (0 missing)
## hours.per.week < 35.5 to the left, improve=3.04, (0 missing)
## education.num < 10.5 to the left, improve=1.87, (0 missing)
## Surrogate splits:
## age < 21 to the left, agree=0.996, adj=0.8, (0 split)
##
## Node number 6: 10475 observations, complexity param=0.0617
## predicted class=<=50K expected loss=0.329 P(node) =0.323
## class counts: 7029 3446
## probabilities: 0.671 0.329
## left son=12 (9979 obs) right son=13 (496 obs)
## Primary splits:
## capital.gain < 5100 to the left, improve=438, (0 missing)
## education.num < 8.5 to the left, improve=178, (0 missing)
## age < 35.5 to the left, improve=134, (0 missing)
## Exec.managerial < 0.5 to the left, improve=125, (0 missing)
## capital.loss < 1780 to the left, improve=118, (0 missing)
##
## Node number 7: 4369 observations, complexity param=0.00356
## predicted class=>50K expected loss=0.287 P(node) =0.135
## class counts: 1255 3114
## probabilities: 0.287 0.713
## left son=14 (3788 obs) right son=15 (581 obs)
## Primary splits:
## capital.gain < 5100 to the left, improve=107.0, (0 missing)
## capital.loss < 1780 to the left, improve= 56.8, (0 missing)
## hours.per.week < 31 to the left, improve= 54.9, (0 missing)
## age < 28.5 to the left, improve= 40.6, (0 missing)
## education.num < 13.5 to the left, improve= 31.9, (0 missing)
##
## Node number 10: 5 observations
## predicted class=<=50K expected loss=0 P(node) =0.000154
## class counts: 5 0
## probabilities: 1.000 0.000
##
## Node number 11: 279 observations
## predicted class=>50K expected loss=0.0215 P(node) =0.00861
## class counts: 6 273
## probabilities: 0.022 0.978
##
## Node number 12: 9979 observations, complexity param=0.011
## predicted class=<=50K expected loss=0.297 P(node) =0.308
## class counts: 7018 2961
## probabilities: 0.703 0.297
## left son=24 (1656 obs) right son=25 (8323 obs)
## Primary splits:
## education.num < 8.5 to the left, improve=152, (0 missing)
## capital.loss < 1780 to the left, improve=139, (0 missing)
## age < 35.5 to the left, improve=108, (0 missing)
## Exec.managerial < 0.5 to the left, improve=102, (0 missing)
## hours.per.week < 34.5 to the left, improve= 60, (0 missing)
## Surrogate splits:
## age < 17.5 to the left, agree=0.834, adj=0.001, (0 split)
##
## Node number 13: 496 observations
## predicted class=>50K expected loss=0.0222 P(node) =0.0153
## class counts: 11 485
## probabilities: 0.022 0.978
##
## Node number 14: 3788 observations, complexity param=0.00356
## predicted class=>50K expected loss=0.331 P(node) =0.117
## class counts: 1252 2536
## probabilities: 0.331 0.669
## left son=28 (3390 obs) right son=29 (398 obs)
## Primary splits:
## capital.loss < 1780 to the left, improve=78.9, (0 missing)
## hours.per.week < 31 to the left, improve=52.3, (0 missing)
## age < 28.5 to the left, improve=33.9, (0 missing)
## education.num < 13.5 to the left, improve=30.7, (0 missing)
## capital.gain < 3120 to the right, improve=30.0, (0 missing)
##
## Node number 15: 581 observations
## predicted class=>50K expected loss=0.00516 P(node) =0.0179
## class counts: 3 578
## probabilities: 0.005 0.995
##
## Node number 24: 1656 observations
## predicted class=<=50K expected loss=0.101 P(node) =0.0511
## class counts: 1489 167
## probabilities: 0.899 0.101
##
## Node number 25: 8323 observations, complexity param=0.011
## predicted class=<=50K expected loss=0.336 P(node) =0.257
## class counts: 5529 2794
## probabilities: 0.664 0.336
## left son=50 (7988 obs) right son=51 (335 obs)
## Primary splits:
## capital.loss < 1780 to the left, improve=121.0, (0 missing)
## age < 35.5 to the left, improve=121.0, (0 missing)
## Exec.managerial < 0.5 to the left, improve= 77.0, (0 missing)
## education.num < 9.5 to the left, improve= 61.8, (0 missing)
## hours.per.week < 34.5 to the left, improve= 57.2, (0 missing)
##
## Node number 28: 3390 observations, complexity param=0.00356
## predicted class=>50K expected loss=0.365 P(node) =0.105
## class counts: 1239 2151
## probabilities: 0.365 0.635
## left son=56 (306 obs) right son=57 (3084 obs)
## Primary splits:
## hours.per.week < 31 to the left, improve=48.5, (0 missing)
## age < 28.5 to the left, improve=29.9, (0 missing)
## capital.gain < 3120 to the right, improve=25.8, (0 missing)
## Exec.managerial < 0.5 to the left, improve=24.8, (0 missing)
## education.num < 13.5 to the left, improve=24.1, (0 missing)
## Surrogate splits:
## age < 66.5 to the right, agree=0.917, adj=0.075, (0 split)
##
## Node number 29: 398 observations
## predicted class=>50K expected loss=0.0327 P(node) =0.0123
## class counts: 13 385
## probabilities: 0.033 0.967
##
## Node number 50: 7988 observations
## predicted class=<=50K expected loss=0.318 P(node) =0.247
## class counts: 5446 2542
## probabilities: 0.682 0.318
##
## Node number 51: 335 observations
## predicted class=>50K expected loss=0.248 P(node) =0.0103
## class counts: 83 252
## probabilities: 0.248 0.752
##
## Node number 56: 306 observations
## predicted class=<=50K expected loss=0.366 P(node) =0.00944
## class counts: 194 112
## probabilities: 0.634 0.366
##
## Node number 57: 3084 observations
## predicted class=>50K expected loss=0.339 P(node) =0.0952
## class counts: 1045 2039
## probabilities: 0.339 0.661
#Variable importance of the pruned rpart model (caret::varImp scores)
varimp <- varImp(treepruned)
varimp
## Overall
## age 1494.555990
## capital.gain 3240.846180
## capital.loss 841.558520
## education.num 2742.553467
## Exec.managerial 660.776425
## Handlers.cleaners 3.722355
## hours.per.week 382.023696
## Married.civ.spouse 2298.950331
## Never.married 1175.483228
## Prof.specialty 318.529199
## Local.gov 0.000000
## No.gain 0.000000
## Private 0.000000
## Self.emp.inc 0.000000
## Self.emp.not.inc 0.000000
## State.gov 0.000000
## Married.AF.spouse 0.000000
## Married.spouse.absent 0.000000
## Separated 0.000000
## Widowed 0.000000
## Armed.Forces 0.000000
## Craft.repair 0.000000
## Farming.fishing 0.000000
## Machine.op.inspct 0.000000
## Other.service 0.000000
## Priv.house.serv 0.000000
## Protective.serv 0.000000
## Sales 0.000000
## Tech.support 0.000000
## Transport.moving 0.000000
## Not.in.family 0.000000
## Other.relative 0.000000
## Own.child 0.000000
## Unmarried 0.000000
## Wife 0.000000
## Asian.Pac.Islander 0.000000
## Black 0.000000
## Other 0.000000
## White 0.000000
## Male 0.000000
## other_countries 0.000000
## Philippines 0.000000
## United.States 0.000000
# Visualization of variable importance: bar chart sorted by decreasing score.
# Attach the row names as a column so ggplot can map them to the x axis.
varimp <- data.frame(varimp, name = rownames(varimp))
ggplot(varimp, aes(x = reorder(name, -Overall), y = Overall)) +
  geom_col() +
  theme(axis.text.x = element_text(color = "blue", size = 6, angle = 90))
# Visualization of the pruned tree itself
rpart.plot(treepruned)
#predicted income class from pruned tree object on train dataset
treepred1 <- predict(treepruned, newdata = newtrain2, type = "class")
#Confusion matrix - train dataset
# NOTE(review): caret::confusionMatrix(data, reference) documents the
# predictions as the FIRST argument and the truth as the SECOND; here the
# truth (income) is passed first, so Sensitivity / Pos Pred Value etc. in
# the printout have their roles swapped relative to caret's definitions —
# confirm this orientation is intended before reusing the statistics.
confusion1 <- confusionMatrix(newtrain2$income, treepred1)
confusion1
## Confusion Matrix and Statistics
##
## Reference
## Prediction <=50K >50K
## <=50K 23559 1161
## >50K 3670 4012
##
## Accuracy : 0.8509
## 95% CI : (0.847, 0.8548)
## No Information Rate : 0.8403
## P-Value [Acc > NIR] : 0.0000000869
##
## Kappa : 0.5356
## Mcnemar's Test P-Value : < 0.00000000000000022
##
## Sensitivity : 0.8652
## Specificity : 0.7756
## Pos Pred Value : 0.9530
## Neg Pred Value : 0.5223
## Prevalence : 0.8403
## Detection Rate : 0.7271
## Detection Prevalence : 0.7629
## Balanced Accuracy : 0.8204
##
## 'Positive' Class : <=50K
##
#Training accuracy rate
# accuracy = sum of the confusion-table diagonal over the total count
(confusion1$table[1, 1] + confusion1$table[2, 2]) / sum(confusion1$table)
## [1] 0.8509043
#Predicted income class from pruned tree object on test dataset
treepred2 <- predict(treepruned, newdata = newtest2, type="class")
#Confusion matrix - test dataset
# NOTE(review): as above, arguments are (truth, prediction) while caret
# documents confusionMatrix(data = prediction, reference = truth).
confusion2 <- confusionMatrix(newtest2$income, treepred2)
confusion2
## Confusion Matrix and Statistics
##
## Reference
## Prediction <=50K >50K
## <=50K 11872 563
## >50K 1837 1924
##
## Accuracy : 0.8518
## 95% CI : (0.8462, 0.8573)
## No Information Rate : 0.8464
## P-Value [Acc > NIR] : 0.02926
##
## Kappa : 0.5288
## Mcnemar's Test P-Value : < 0.0000000000000002
##
## Sensitivity : 0.8660
## Specificity : 0.7736
## Pos Pred Value : 0.9547
## Neg Pred Value : 0.5116
## Prevalence : 0.8464
## Detection Rate : 0.7330
## Detection Prevalence : 0.7678
## Balanced Accuracy : 0.8198
##
## 'Positive' Class : <=50K
##
#Misclassification Rate of prunned tree on test dataset
(confusion2$table[1, 2] + confusion2$table[2, 1]) / sum(confusion2$table)
## [1] 0.1481847
#Accuracy Rate of prunned tree on test dataset
(confusion2$table[1, 1] +confusion2$table[2, 2]) / sum(confusion2$table)
## [1] 0.8518153
# ROC curve of the baseline (unpruned) tree on the test set
# (see https://stackoverflow.com/questions/30818188/roc-curve-in-r-using-rpart-package)
# Column 2 of the probability matrix is the predicted P(income == ">50K")
tree_prob <- predict(tree, newdata = newtest2, type = "prob")[, 2]
tree_prediction <- ROCR::prediction(tree_prob, newtest2$income)
tree_performance <- ROCR::performance(tree_prediction, measure = "tpr", x.measure = "fpr")
# Draw the ROC curve with the chance diagonal for reference
plot(tree_performance, main = "ROC curve")
abline(a = 0, b = 1, lty = 2)
# Area under the ROC curve
tree.auc <- ROCR::performance(tree_prediction, measure = "auc")@y.values[[1]]
print(tree.auc)
## [1] 0.8768653
#Pick the best threshold
str(tree_performance)
## Formal class 'performance' [package "ROCR"] with 6 slots
## ..@ x.name : chr "False positive rate"
## ..@ y.name : chr "True positive rate"
## ..@ alpha.name : chr "Cutoff"
## ..@ x.values :List of 1
## .. ..$ : num [1:15] 0 0.000241 0.000241 0.000483 0.001045 ...
## ..@ y.values :List of 1
## .. ..$ : num [1:15] 0 0.033 0.107 0.166 0.213 ...
## ..@ alpha.values:List of 1
## .. ..$ : num [1:15] Inf 1 0.995 0.982 0.967 ...
# Tabulate cutoff / FPR / TPR triples from the ROCR performance object
cutoffs <- data.frame(cut = tree_performance@alpha.values[[1]],
fpr = tree_performance@x.values[[1]],
tpr = tree_performance@y.values[[1]])
head(cutoffs)
## cut fpr tpr
## 1 Inf 0.0000000000 0.00000000
## 2 1.0000000 0.0002412545 0.03296995
## 3 0.9948365 0.0002412545 0.10688647
## 4 0.9817814 0.0004825090 0.16591332
## 5 0.9673367 0.0010454363 0.21270939
## 6 0.7522388 0.0038600724 0.24541345
roc <- pROC::roc(newtest2$income, tree_prob)
# FIX: coords() returns a one-row data.frame in pROC >= 1.16; coerce to a
# plain numeric so cat() and the numeric comparison below keep working.
threshold <- as.numeric(coords(roc, "best", ret = "threshold"))
cat("The best threshold is : ", threshold, "\n")
## The best threshold is : 0.1993402
#Get accuracy rate of testset data using the optimal threshold ****
# FIX: confusionMatrix() requires factor inputs; recent caret releases
# error on raw logical vectors.
confusionMatrix(factor(tree_prob > threshold), factor(newtest2$income == ">50K"))
## Confusion Matrix and Statistics
##
## Reference
## Prediction FALSE TRUE
## FALSE 9084 491
## TRUE 3351 3270
##
## Accuracy : 0.7628
## 95% CI : (0.7562, 0.7693)
## No Information Rate : 0.7678
## P-Value [Acc > NIR] : 0.9351
##
## Kappa : 0.4742
## Mcnemar's Test P-Value : <0.0000000000000002
##
## Sensitivity : 0.7305
## Specificity : 0.8694
## Pos Pred Value : 0.9487
## Neg Pred Value : 0.4939
## Prevalence : 0.7678
## Detection Rate : 0.5609
## Detection Prevalence : 0.5912
## Balanced Accuracy : 0.8000
##
## 'Positive' Class : FALSE
##
# ROC curve of the pruned tree on the test set
# Column 2 of the probability matrix is the predicted P(income == ">50K")
pruned_prob <- predict(treepruned, newdata = newtest2, type = "prob")[, 2]
pruned_prediction <- ROCR::prediction(pruned_prob, newtest2$income)
pruned_performance <- ROCR::performance(pruned_prediction, measure = "tpr", x.measure = "fpr")
# Draw the ROC curve with the chance diagonal for reference
plot(pruned_performance, main = "ROC curve")
abline(a = 0, b = 1, lty = 2)
# Area under the ROC curve
pruned.auc <- ROCR::performance(pruned_prediction, measure = "auc")@y.values[[1]]
print(pruned.auc)
## [1] 0.858427
#Pick the best threshold
str(pruned_performance)
## Formal class 'performance' [package "ROCR"] with 6 slots
## ..@ x.name : chr "False positive rate"
## ..@ y.name : chr "True positive rate"
## ..@ alpha.name : chr "Cutoff"
## ..@ x.values :List of 1
## .. ..$ : num [1:12] 0 0 0.000322 0.000643 0.001206 ...
## ..@ y.values :List of 1
## .. ..$ : num [1:12] 0 0.0739 0.1072 0.1662 0.213 ...
## ..@ alpha.values:List of 1
## .. ..$ : num [1:12] Inf 0.995 0.978 0.978 0.967 ...
# Tabulate cutoff / FPR / TPR triples from the ROCR performance object
cutoffs <- data.frame(cut = pruned_performance@alpha.values[[1]],
fpr = pruned_performance@x.values[[1]],
tpr = pruned_performance@y.values[[1]])
head(cutoffs)
## cut fpr tpr
## 1 Inf 0.0000000000 0.00000000
## 2 0.9948365 0.0000000000 0.07391651
## 3 0.9784946 0.0003216727 0.10715235
## 4 0.9778226 0.0006433454 0.16617921
## 5 0.9673367 0.0012062726 0.21297527
## 6 0.7522388 0.0040209087 0.24567934
roc <- pROC::roc(newtest2$income, pruned_prob)
# FIX: coords() returns a one-row data.frame in pROC >= 1.16; coerce to a
# plain numeric so cat() and the numeric comparison below keep working.
threshold <- as.numeric(coords(roc, "best", ret = "threshold"))
cat("The best threshold is : ", threshold, "\n")
## The best threshold is : 0.2095364
#Get accuracy rate of testset data using the optimal threshold ****
# FIX: confusionMatrix() requires factor inputs; recent caret releases
# error on raw logical vectors.
confusionMatrix(factor(pruned_prob > threshold), factor(newtest2$income == ">50K"))
## Confusion Matrix and Statistics
##
## Reference
## Prediction FALSE TRUE
## FALSE 9098 504
## TRUE 3337 3257
##
## Accuracy : 0.7628
## 95% CI : (0.7562, 0.7694)
## No Information Rate : 0.7678
## P-Value [Acc > NIR] : 0.9327
##
## Kappa : 0.4733
## Mcnemar's Test P-Value : <0.0000000000000002
##
## Sensitivity : 0.7316
## Specificity : 0.8660
## Pos Pred Value : 0.9475
## Neg Pred Value : 0.4939
## Prevalence : 0.7678
## Detection Rate : 0.5617
## Detection Prevalence : 0.5929
## Balanced Accuracy : 0.7988
##
## 'Positive' Class : FALSE
##
\(\\\)
\(\\\)
# Reproducible cross-validation fold assignment
set.seed(100)
# 10-fold cross-validation repeated 3 times
trctrl <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
# Training the Decision Tree classifier with criterion as gini index
# tuneLength = 10 lets caret evaluate 10 candidate cp values
dtree_fit <- caret::train(income ~., data = newtrain2,
method = "rpart",
parms = list(split = "gini"),
trControl = trctrl,
tuneLength = 10)
dtree_fit
## CART
##
## 32402 samples
## 43 predictor
## 2 classes: '<=50K', '>50K'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 29162, 29161, 29162, 29162, 29162, 29162, ...
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 0.001171570 0.8578071 0.5689948
## 0.001366832 0.8577248 0.5670715
## 0.002212965 0.8565314 0.5584858
## 0.002629524 0.8552043 0.5505177
## 0.003558101 0.8514082 0.5409312
## 0.006769071 0.8451535 0.5205961
## 0.010999740 0.8448962 0.5199522
## 0.034105702 0.8385798 0.4924548
## 0.061702682 0.8264613 0.4422631
## 0.120997136 0.7879256 0.1876654
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.00117157.
#Tuning parameter - cp (complexity parameter selected by CV accuracy)
dtree_fit$bestTune
## cp
## 1 0.00117157
#The model we selected by using the optimal cp we got
dtree_fit$finalModel
## n= 32402
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 32402 7682 <=50K (0.762915869 0.237084131)
## 2) Married.civ.spouse< 0.5 17558 1122 <=50K (0.936097505 0.063902495)
## 4) capital.gain< 7073.5 17274 849 <=50K (0.950850990 0.049149010) *
## 5) capital.gain>=7073.5 284 11 >50K (0.038732394 0.961267606) *
## 3) Married.civ.spouse>=0.5 14844 6560 <=50K (0.558070601 0.441929399)
## 6) education.num< 12.5 10475 3446 <=50K (0.671026253 0.328973747)
## 12) capital.gain< 5095.5 9979 2961 <=50K (0.703276881 0.296723119)
## 24) education.num< 8.5 1656 167 <=50K (0.899154589 0.100845411) *
## 25) education.num>=8.5 8323 2794 <=50K (0.664303737 0.335696263)
## 50) capital.loss< 1782.5 7988 2542 <=50K (0.681772659 0.318227341)
## 100) age< 35.5 2704 552 <=50K (0.795857988 0.204142012) *
## 101) age>=35.5 5284 1990 <=50K (0.623391370 0.376608630)
## 202) hours.per.week< 34.5 578 91 <=50K (0.842560554 0.157439446) *
## 203) hours.per.week>=34.5 4706 1899 <=50K (0.596472588 0.403527412)
## 406) education.num< 9.5 2622 908 <=50K (0.653699466 0.346300534) *
## 407) education.num>=9.5 2084 991 <=50K (0.524472169 0.475527831)
## 814) Self.emp.not.inc>=0.5 245 69 <=50K (0.718367347 0.281632653) *
## 815) Self.emp.not.inc< 0.5 1839 917 >50K (0.498640566 0.501359434)
## 1630) Exec.managerial< 0.5 1504 704 <=50K (0.531914894 0.468085106)
## 3260) Tech.support< 0.5 1390 629 <=50K (0.547482014 0.452517986) *
## 3261) Tech.support>=0.5 114 39 >50K (0.342105263 0.657894737) *
## 1631) Exec.managerial>=0.5 335 117 >50K (0.349253731 0.650746269) *
## 51) capital.loss>=1782.5 335 83 >50K (0.247761194 0.752238806)
## 102) capital.loss>=1989.5 94 21 <=50K (0.776595745 0.223404255) *
## 103) capital.loss< 1989.5 241 10 >50K (0.041493776 0.958506224) *
## 13) capital.gain>=5095.5 496 11 >50K (0.022177419 0.977822581) *
## 7) education.num>=12.5 4369 1255 >50K (0.287251087 0.712748913)
## 14) capital.gain< 5095.5 3788 1252 >50K (0.330517423 0.669482577)
## 28) capital.loss< 1782.5 3390 1239 >50K (0.365486726 0.634513274)
## 56) hours.per.week< 31 306 112 <=50K (0.633986928 0.366013072)
## 112) Wife< 0.5 233 67 <=50K (0.712446352 0.287553648) *
## 113) Wife>=0.5 73 28 >50K (0.383561644 0.616438356) *
## 57) hours.per.week>=31 3084 1045 >50K (0.338845655 0.661154345)
## 114) age< 28.5 211 89 <=50K (0.578199052 0.421800948) *
## 115) age>=28.5 2873 923 >50K (0.321266968 0.678733032)
## 230) capital.gain>=3120 52 11 <=50K (0.788461538 0.211538462) *
## 231) capital.gain< 3120 2821 882 >50K (0.312655087 0.687344913)
## 462) Exec.managerial< 0.5 1998 695 >50K (0.347847848 0.652152152)
## 924) Prof.specialty< 0.5 911 398 >50K (0.436882547 0.563117453)
## 1848) Other.service>=0.5 35 8 <=50K (0.771428571 0.228571429) *
## 1849) Other.service< 0.5 876 371 >50K (0.423515982 0.576484018)
## 3698) Self.emp.not.inc>=0.5 123 50 <=50K (0.593495935 0.406504065) *
## 3699) Self.emp.not.inc< 0.5 753 298 >50K (0.395750332 0.604249668) *
## 925) Prof.specialty>=0.5 1087 297 >50K (0.273229071 0.726770929) *
## 463) Exec.managerial>=0.5 823 187 >50K (0.227217497 0.772782503) *
## 29) capital.loss>=1782.5 398 13 >50K (0.032663317 0.967336683) *
## 15) capital.gain>=5095.5 581 3 >50K (0.005163511 0.994836489) *
# Plot the tuned (gini) classification tree
prp(dtree_fit$finalModel, extra = 1, faclen = 0,
    fallen.leaves = FALSE, tweak = 0.8, box.palette = "Reds")
# Variable importance of the tuned (gini) model
varimp2 <- varImp(dtree_fit$finalModel)
print(varimp2)
## Overall
## age 1632.953509
## capital.gain 3279.194816
## capital.loss 979.941955
## education.num 2956.577327
## Exec.managerial 902.677175
## Farming.fishing 120.791701
## Handlers.cleaners 19.440365
## hours.per.week 508.081524
## Local.gov 1.464908
## Male 10.337328
## Married.civ.spouse 2298.950331
## Never.married 1175.483228
## other_countries 11.895201
## Other.service 155.426390
## Prof.specialty 347.449953
## Sales 12.786564
## Self.emp.not.inc 127.720699
## Tech.support 8.888007
## Transport.moving 5.897710
## Wife 12.024713
## No.gain 0.000000
## Private 0.000000
## Self.emp.inc 0.000000
## State.gov 0.000000
## Married.AF.spouse 0.000000
## Married.spouse.absent 0.000000
## Separated 0.000000
## Widowed 0.000000
## Armed.Forces 0.000000
## Craft.repair 0.000000
## Machine.op.inspct 0.000000
## Priv.house.serv 0.000000
## Protective.serv 0.000000
## Not.in.family 0.000000
## Other.relative 0.000000
## Own.child 0.000000
## Unmarried 0.000000
## Asian.Pac.Islander 0.000000
## Black 0.000000
## Other 0.000000
## White 0.000000
## Philippines 0.000000
## United.States 0.000000
#Visualization of variable importance (gini-tuned model)
varimp2 <- data.frame(varimp2, name = rownames(varimp2))
ggplot(varimp2, aes(x = reorder(name, -Overall), y = Overall)) +
geom_bar(stat = "identity") +
theme(axis.text.x = element_text(color = "blue", size = 6, angle = 90))
#Predicted income class from the finalmodel tree object on train dataset
treepred3 <- predict(dtree_fit$finalModel, newdata = newtrain2, type = "class")
#Confusion matrix - train dataset
# NOTE(review): truth is passed as the first argument; caret documents the
# prediction first — verify before reusing the printed statistics.
confusion3 <- confusionMatrix(newtrain2$income, treepred3)
confusion3
## Confusion Matrix and Statistics
##
## Reference
## Prediction <=50K >50K
## <=50K 23706 1014
## >50K 3511 4171
##
## Accuracy : 0.8603
## 95% CI : (0.8565, 0.8641)
## No Information Rate : 0.84
## P-Value [Acc > NIR] : < 0.00000000000000022
##
## Kappa : 0.5653
## Mcnemar's Test P-Value : < 0.00000000000000022
##
## Sensitivity : 0.8710
## Specificity : 0.8044
## Pos Pred Value : 0.9590
## Neg Pred Value : 0.5430
## Prevalence : 0.8400
## Detection Rate : 0.7316
## Detection Prevalence : 0.7629
## Balanced Accuracy : 0.8377
##
## 'Positive' Class : <=50K
##
#Training accuracy rate (confusion-table diagonal over the total)
(confusion3$table[1,1] + confusion3$table[2,2]) / sum(confusion3$table)
## [1] 0.8603481
#Predicted income class from the finalmodel tree object on test dataset
treepred4 <- predict(dtree_fit$finalModel, newdata = newtest2, type = "class")
#Confusion matrix - test dataset
confusion4 <- confusionMatrix(newtest2$income, treepred4)
confusion4
## Confusion Matrix and Statistics
##
## Reference
## Prediction <=50K >50K
## <=50K 11918 517
## >50K 1747 2014
##
## Accuracy : 0.8602
## 95% CI : (0.8548, 0.8655)
## No Information Rate : 0.8437
## P-Value [Acc > NIR] : 0.000000002442
##
## Kappa : 0.5575
## Mcnemar's Test P-Value : < 0.00000000000000022
##
## Sensitivity : 0.8722
## Specificity : 0.7957
## Pos Pred Value : 0.9584
## Neg Pred Value : 0.5355
## Prevalence : 0.8437
## Detection Rate : 0.7359
## Detection Prevalence : 0.7678
## Balanced Accuracy : 0.8339
##
## 'Positive' Class : <=50K
##
#Misclassification Rate of finalmodel tree on test dataset
(confusion4$table[1, 2] + confusion4$table[2, 1]) / sum(confusion4$table)
## [1] 0.1397876
#Accuracy Rate of finalmodel tree on test dataset
(confusion4$table[1, 1] + confusion4$table[2, 2]) / sum(confusion4$table)
## [1] 0.8602124
#Getting predicted >50K of income probabilities
# column 2 of the probability matrix corresponds to the ">50K" class
gini_prob <- predict(dtree_fit, newdata = newtest2, type = "prob")[, 2]
gini_prediction <- prediction(gini_prob, newtest2$income)
gini_performance <- ROCR::performance(gini_prediction, measure = "tpr", x.measure = "fpr")
#ROC Curve : https://stackoverflow.com/questions/30818188/roc-curve-in-r-using-rpart-package
#Plot ROC curve with the chance diagonal for reference
plot(gini_performance, main = "ROC curve")
abline(a = 0, b = 1, lty = 2)
#Calculate AUC
gini.auc <- ROCR::performance(gini_prediction, measure="auc")@y.values[[1]]
gini.auc
## [1] 0.8718559
#Pick the best threshold
str(gini_performance)
## Formal class 'performance' [package "ROCR"] with 6 slots
## ..@ x.name : chr "False positive rate"
## ..@ y.name : chr "True positive rate"
## ..@ alpha.name : chr "Cutoff"
## ..@ x.values :List of 1
## .. ..$ : num [1:25] 0 0 0.000322 0.000885 0.001287 ...
## ..@ y.values :List of 1
## .. ..$ : num [1:25] 0 0.0739 0.1329 0.1797 0.213 ...
## ..@ alpha.values:List of 1
## .. ..$ : num [1:25] Inf 0.995 0.978 0.967 0.961 ...
# Tabulate cutoff / FPR / TPR triples from the ROCR performance object
cutoffs <- data.frame(cut = gini_performance@alpha.values[[1]],
fpr = gini_performance@x.values[[1]],
tpr = gini_performance@y.values[[1]])
head(cutoffs)
## cut fpr tpr
## 1 Inf 0.0000000000 0.00000000
## 2 0.9948365 0.0000000000 0.07391651
## 3 0.9778226 0.0003216727 0.13294337
## 4 0.9673367 0.0008845999 0.17973943
## 5 0.9612676 0.0012866908 0.21297527
## 6 0.9585062 0.0016887817 0.24381813
roc <- pROC::roc(newtest2$income, gini_prob)
# FIX: coords() returns a one-row data.frame in pROC >= 1.16; coerce to a
# plain numeric so cat() and the numeric comparison below keep working.
threshold <- as.numeric(coords(roc, "best", ret = "threshold"))
cat("The best threshold is : ", threshold, "\n")
## The best threshold is : 0.2259878
#Get accuracy rate of testset data using the optimal threshold ****
# FIX: confusionMatrix() requires factor inputs; recent caret releases
# error on raw logical vectors.
confusionMatrix(factor(gini_prob > threshold), factor(newtest2$income == ">50K"))
## Confusion Matrix and Statistics
##
## Reference
## Prediction FALSE TRUE
## FALSE 10364 861
## TRUE 2071 2900
##
## Accuracy : 0.819
## 95% CI : (0.8129, 0.8249)
## No Information Rate : 0.7678
## P-Value [Acc > NIR] : < 0.00000000000000022
##
## Kappa : 0.5435
## Mcnemar's Test P-Value : < 0.00000000000000022
##
## Sensitivity : 0.8335
## Specificity : 0.7711
## Pos Pred Value : 0.9233
## Neg Pred Value : 0.5834
## Prevalence : 0.7678
## Detection Rate : 0.6399
## Detection Prevalence : 0.6931
## Balanced Accuracy : 0.8023
##
## 'Positive' Class : FALSE
##
#====================================================================
#Training the Decision Tree classifier with criterion as information gain(cross entropy)
# Same seed and resampling control as the gini run, so both are comparable
set.seed(100)
dtree_fit_info <- caret::train(income ~., data = newtrain2, method = "rpart",
parms = list(split = "information"),
trControl = trctrl,
tuneLength = 10)
dtree_fit_info
## CART
##
## 32402 samples
## 43 predictor
## 2 classes: '<=50K', '>50K'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 29162, 29161, 29162, 29162, 29162, 29162, ...
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 0.001171570 0.8576940 0.5683314
## 0.001366832 0.8572001 0.5639238
## 0.002212965 0.8547208 0.5507302
## 0.002629524 0.8536818 0.5444061
## 0.003558101 0.8501429 0.5352532
## 0.006769071 0.8442687 0.5163148
## 0.010999740 0.8432194 0.5114075
## 0.034105702 0.8385798 0.4924548
## 0.061702682 0.8264613 0.4422631
## 0.120997136 0.7879256 0.1876654
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.00117157.
#Tuning parameter - cp (complexity parameter selected by CV accuracy)
dtree_fit_info$bestTune
## cp
## 1 0.00117157
#The model we selected by using the optimal cp we got
dtree_fit_info$finalModel
## n= 32402
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 32402 7682 <=50K (0.762915869 0.237084131)
## 2) Married.civ.spouse< 0.5 17558 1122 <=50K (0.936097505 0.063902495)
## 4) capital.gain< 7073.5 17274 849 <=50K (0.950850990 0.049149010) *
## 5) capital.gain>=7073.5 284 11 >50K (0.038732394 0.961267606) *
## 3) Married.civ.spouse>=0.5 14844 6560 <=50K (0.558070601 0.441929399)
## 6) education.num< 12.5 10475 3446 <=50K (0.671026253 0.328973747)
## 12) capital.gain< 5095.5 9979 2961 <=50K (0.703276881 0.296723119)
## 24) education.num< 8.5 1656 167 <=50K (0.899154589 0.100845411) *
## 25) education.num>=8.5 8323 2794 <=50K (0.664303737 0.335696263)
## 50) age< 35.5 2782 599 <=50K (0.784687275 0.215312725)
## 100) age< 24.5 338 19 <=50K (0.943786982 0.056213018) *
## 101) age>=24.5 2444 580 <=50K (0.762684124 0.237315876)
## 202) capital.loss< 1794 2371 533 <=50K (0.775200337 0.224799663) *
## 203) capital.loss>=1794 73 26 >50K (0.356164384 0.643835616)
## 406) capital.loss>=1989.5 25 1 <=50K (0.960000000 0.040000000) *
## 407) capital.loss< 1989.5 48 2 >50K (0.041666667 0.958333333) *
## 51) age>=35.5 5541 2195 <=50K (0.603862119 0.396137881)
## 102) capital.loss< 1782.5 5284 1990 <=50K (0.623391370 0.376608630)
## 204) hours.per.week< 34.5 578 91 <=50K (0.842560554 0.157439446) *
## 205) hours.per.week>=34.5 4706 1899 <=50K (0.596472588 0.403527412)
## 410) education.num< 9.5 2622 908 <=50K (0.653699466 0.346300534) *
## 411) education.num>=9.5 2084 991 <=50K (0.524472169 0.475527831)
## 822) Self.emp.not.inc>=0.5 245 69 <=50K (0.718367347 0.281632653) *
## 823) Self.emp.not.inc< 0.5 1839 917 >50K (0.498640566 0.501359434)
## 1646) Exec.managerial< 0.5 1504 704 <=50K (0.531914894 0.468085106)
## 3292) Handlers.cleaners>=0.5 38 5 <=50K (0.868421053 0.131578947) *
## 3293) Handlers.cleaners< 0.5 1466 699 <=50K (0.523192360 0.476807640)
## 6586) Other.service>=0.5 59 13 <=50K (0.779661017 0.220338983) *
## 6587) Other.service< 0.5 1407 686 <=50K (0.512437811 0.487562189)
## 13174) capital.loss>=1532 12 0 <=50K (1.000000000 0.000000000) *
## 13175) capital.loss< 1532 1395 686 <=50K (0.508243728 0.491756272)
## 26350) Transport.moving>=0.5 102 31 <=50K (0.696078431 0.303921569) *
## 26351) Transport.moving< 0.5 1293 638 >50K (0.493426141 0.506573859)
## 52702) capital.gain>=4699.5 10 0 <=50K (1.000000000 0.000000000) *
## 52703) capital.gain< 4699.5 1283 628 >50K (0.489477786 0.510522214)
## 105406) Tech.support< 0.5 1171 580 <=50K (0.504696840 0.495303160)
## 210812) Machine.op.inspct>=0.5 95 32 <=50K (0.663157895 0.336842105) *
## 210813) Machine.op.inspct< 0.5 1076 528 >50K (0.490706320 0.509293680)
## 421626) age>=58.5 91 32 <=50K (0.648351648 0.351648352) *
## 421627) age< 58.5 985 469 >50K (0.476142132 0.523857868)
## 843254) hours.per.week< 43.5 623 306 <=50K (0.508828250 0.491171750)
## 1686508) capital.gain< 4225 616 299 <=50K (0.514610390 0.485389610)
## 3373016) capital.gain>=3120 8 0 <=50K (1.000000000 0.000000000) *
## 3373017) capital.gain< 3120 608 299 <=50K (0.508223684 0.491776316)
## 6746034) Protective.serv< 0.5 566 268 <=50K (0.526501767 0.473498233)
## 13492068) Prof.specialty< 0.5 477 213 <=50K (0.553459119 0.446540881)
## 26984136) age< 47.5 315 124 <=50K (0.606349206 0.393650794) *
## 26984137) age>=47.5 162 73 >50K (0.450617284 0.549382716) *
## 13492069) Prof.specialty>=0.5 89 34 >50K (0.382022472 0.617977528) *
## 6746035) Protective.serv>=0.5 42 11 >50K (0.261904762 0.738095238) *
## 1686509) capital.gain>=4225 7 0 >50K (0.000000000 1.000000000) *
## 843255) hours.per.week>=43.5 362 152 >50K (0.419889503 0.580110497) *
## 105407) Tech.support>=0.5 112 37 >50K (0.330357143 0.669642857) *
## 1647) Exec.managerial>=0.5 335 117 >50K (0.349253731 0.650746269) *
## 103) capital.loss>=1782.5 257 52 >50K (0.202334630 0.797665370)
## 206) capital.loss>=1989.5 66 20 <=50K (0.696969697 0.303030303) *
## 207) capital.loss< 1989.5 191 6 >50K (0.031413613 0.968586387) *
## 13) capital.gain>=5095.5 496 11 >50K (0.022177419 0.977822581) *
## 7) education.num>=12.5 4369 1255 >50K (0.287251087 0.712748913)
## 14) capital.gain< 5095.5 3788 1252 >50K (0.330517423 0.669482577)
## 28) capital.loss< 1782.5 3390 1239 >50K (0.365486726 0.634513274)
## 56) hours.per.week< 31 306 112 <=50K (0.633986928 0.366013072)
## 112) Wife< 0.5 233 67 <=50K (0.712446352 0.287553648) *
## 113) Wife>=0.5 73 28 >50K (0.383561644 0.616438356) *
## 57) hours.per.week>=31 3084 1045 >50K (0.338845655 0.661154345)
## 114) age< 28.5 211 89 <=50K (0.578199052 0.421800948) *
## 115) age>=28.5 2873 923 >50K (0.321266968 0.678733032)
## 230) capital.gain>=3120 52 11 <=50K (0.788461538 0.211538462) *
## 231) capital.gain< 3120 2821 882 >50K (0.312655087 0.687344913)
## 462) Exec.managerial< 0.5 1998 695 >50K (0.347847848 0.652152152)
## 924) Prof.specialty< 0.5 911 398 >50K (0.436882547 0.563117453)
## 1848) Other.service>=0.5 35 8 <=50K (0.771428571 0.228571429) *
## 1849) Other.service< 0.5 876 371 >50K (0.423515982 0.576484018)
## 3698) Self.emp.not.inc>=0.5 123 50 <=50K (0.593495935 0.406504065) *
## 3699) Self.emp.not.inc< 0.5 753 298 >50K (0.395750332 0.604249668) *
## 925) Prof.specialty>=0.5 1087 297 >50K (0.273229071 0.726770929) *
## 463) Exec.managerial>=0.5 823 187 >50K (0.227217497 0.772782503) *
## 29) capital.loss>=1782.5 398 13 >50K (0.032663317 0.967336683) *
## 15) capital.gain>=5095.5 581 3 >50K (0.005163511 0.994836489) *
# Plot the tuned (information-gain) classification tree
prp(dtree_fit_info$finalModel, extra = 1, tweak = 1.2, box.palette = "Blues")
# Variable importance of the information-gain model
varimp3 <- varImp(dtree_fit_info$finalModel)
print(varimp3)
## Overall
## age 2666.4332900
## capital.gain 4204.0735108
## capital.loss 1246.0965411
## Craft.repair 19.5942637
## education.num 3823.9420314
## Exec.managerial 970.5439782
## Farming.fishing 224.4187849
## Handlers.cleaners 21.8938004
## hours.per.week 895.4359272
## Machine.op.inspct 27.5632778
## Male 10.8358134
## Married.civ.spouse 3386.3856340
## Never.married 1987.9976540
## other_countries 13.0766863
## Other.service 137.7986339
## Own.child 230.5841961
## Private 2.5392738
## Prof.specialty 368.7640471
## Protective.serv 18.2255630
## Sales 17.5572307
## Self.emp.inc 0.8764151
## Self.emp.not.inc 139.2962504
## Tech.support 47.1275640
## Transport.moving 22.3790987
## United.States 2.6915940
## Wife 12.5911026
## Local.gov 0.0000000
## No.gain 0.0000000
## State.gov 0.0000000
## Married.AF.spouse 0.0000000
## Married.spouse.absent 0.0000000
## Separated 0.0000000
## Widowed 0.0000000
## Armed.Forces 0.0000000
## Priv.house.serv 0.0000000
## Not.in.family 0.0000000
## Other.relative 0.0000000
## Unmarried 0.0000000
## Asian.Pac.Islander 0.0000000
## Black 0.0000000
## Other 0.0000000
## White 0.0000000
## Philippines 0.0000000
#Visualization of variable importance (information-gain model)
varimp3 <- data.frame(varimp3, name = rownames(varimp3))
# BUG FIX: this chart must plot varimp3 (the information-gain model's
# importances); the original passed varimp2 (the gini model) by copy-paste,
# so the plot silently showed the wrong model.
ggplot(varimp3, aes(x = reorder(name, -Overall), y = Overall)) +
geom_bar(stat = "identity") +
theme(axis.text.x = element_text(color = "blue", size = 6, angle = 90))
#Predicted income class from the finalmodel tree object on train dataset
treepred <- predict(dtree_fit_info$finalModel, newdata = newtrain2, type = "class")
#Confusion matrix - train dataset
confusion <- confusionMatrix(newtrain2$income, treepred)
confusion
## Confusion Matrix and Statistics
##
## Reference
## Prediction <=50K >50K
## <=50K 23440 1280
## >50K 3119 4563
##
## Accuracy : 0.8642
## 95% CI : (0.8605, 0.8679)
## No Information Rate : 0.8197
## P-Value [Acc > NIR] : < 0.00000000000000022
##
## Kappa : 0.591
## Mcnemar's Test P-Value : < 0.00000000000000022
##
## Sensitivity : 0.8826
## Specificity : 0.7809
## Pos Pred Value : 0.9482
## Neg Pred Value : 0.5940
## Prevalence : 0.8197
## Detection Rate : 0.7234
## Detection Prevalence : 0.7629
## Balanced Accuracy : 0.8317
##
## 'Positive' Class : <=50K
##
#Training accuracy rate
(confusion$table[1,1] + confusion$table[2,2]) / sum(confusion$table)
## [1] 0.8642368
#Predicted income class from the finalmodel tree object on test dataset
# NOTE(review): this reassigns treepred1/confusion1, shadowing the earlier
# pruned-tree results with the information-gain model's results.
treepred1 <- predict(dtree_fit_info$finalModel, newdata = newtest2, type = "class")
#Confusion matrix - test dataset
confusion1 <- confusionMatrix(newtest2$income, treepred1)
confusion1
## Confusion Matrix and Statistics
##
## Reference
## Prediction <=50K >50K
## <=50K 11767 668
## >50K 1585 2176
##
## Accuracy : 0.8609
## 95% CI : (0.8555, 0.8662)
## No Information Rate : 0.8244
## P-Value [Acc > NIR] : < 0.00000000000000022
##
## Kappa : 0.5736
## Mcnemar's Test P-Value : < 0.00000000000000022
##
## Sensitivity : 0.8813
## Specificity : 0.7651
## Pos Pred Value : 0.9463
## Neg Pred Value : 0.5786
## Prevalence : 0.8244
## Detection Rate : 0.7265
## Detection Prevalence : 0.7678
## Balanced Accuracy : 0.8232
##
## 'Positive' Class : <=50K
##
#Misclassification Rate of finalmodel tree on test dataset
(confusion1$table[1, 2] + confusion1$table[2, 1]) / sum(confusion1$table)
## [1] 0.1391084
#Accuracy Rate of finalmodel tree on test dataset
(confusion1$table[1, 1] + confusion1$table[2, 2]) / sum(confusion1$table)
## [1] 0.8608916
#Getting predicted >50K of income probabilities
# column 2 of the probability matrix corresponds to the ">50K" class
info_prob <- predict(dtree_fit_info, newdata = newtest2, type = "prob")[, 2]
info_prediction <- prediction(info_prob, newtest2$income)
info_performance <- ROCR::performance(info_prediction, measure = "tpr", x.measure = "fpr")
#ROC Curve: https://stackoverflow.com/questions/30818188/roc-curve-in-r-using-rpart-package
#Plot ROC curve with the chance diagonal for reference
plot(info_performance, main = "ROC curve")
abline(a = 0, b = 1, lty = 2)
#Calculate AUC
info.auc <- ROCR::performance(info_prediction, measure = "auc")@y.values[[1]]
info.auc
## [1] 0.872279
#Pick the best threshold (ROC-optimal cutoff, not the accuracy-optimal one)
str(info_performance)
## Formal class 'performance' [package "ROCR"] with 6 slots
## ..@ x.name : chr "False positive rate"
## ..@ y.name : chr "True positive rate"
## ..@ alpha.name : chr "Cutoff"
## ..@ x.values :List of 1
## .. ..$ : num [1:39] 0 0 0 0.000322 0.000643 ...
## ..@ y.values :List of 1
## .. ..$ : num [1:39] 0 0.000798 0.074714 0.133741 0.157937 ...
## ..@ alpha.values:List of 1
## .. ..$ : num [1:39] Inf 1 0.995 0.978 0.969 ...
# Tabulate cutoff / FPR / TPR triples from the ROCR performance object
cutoffs <- data.frame(cut = info_performance@alpha.values[[1]],
fpr = info_performance@x.values[[1]],
tpr = info_performance@y.values[[1]])
head(cutoffs)
## cut fpr tpr
## 1 Inf 0.0000000000 0.0000000000
## 2 1.0000000 0.0000000000 0.0007976602
## 3 0.9948365 0.0000000000 0.0747141718
## 4 0.9778226 0.0003216727 0.1337410263
## 5 0.9685864 0.0006433454 0.1579367190
## 6 0.9673367 0.0012062726 0.2047327838
roc <- pROC::roc(newtest2$income, info_prob)
# FIX: coords() returns a one-row data.frame in pROC >= 1.16; coerce to a
# plain numeric so cat() and the numeric comparison below keep working.
threshold <- as.numeric(coords(roc, "best", ret = "threshold"))
cat("The best threshold is : ", threshold, "\n")
## The best threshold is : 0.184489
#Get accuracy rate of testset data using the optimal threshold ****
# FIX: confusionMatrix() requires factor inputs; recent caret releases
# error on raw logical vectors.
confusionMatrix(factor(info_prob > threshold), factor(newtest2$income == ">50K"))
## Confusion Matrix and Statistics
##
## Reference
## Prediction FALSE TRUE
## FALSE 9549 574
## TRUE 2886 3187
##
## Accuracy : 0.7864
## 95% CI : (0.78, 0.7927)
## No Information Rate : 0.7678
## P-Value [Acc > NIR] : 0.000000008282
##
## Kappa : 0.5067
## Mcnemar's Test P-Value : < 0.00000000000000022
##
## Sensitivity : 0.7679
## Specificity : 0.8474
## Pos Pred Value : 0.9433
## Neg Pred Value : 0.5248
## Prevalence : 0.7678
## Detection Rate : 0.5896
## Detection Prevalence : 0.6250
## Balanced Accuracy : 0.8076
##
## 'Positive' Class : FALSE
##
\(\\\)
\(\\\)
set.seed(100)
#Compare the ROC curves of the four tree models on a single plot
plot(pruned_performance, main = "ROC curve", col = "blue")
plot(gini_performance, add = TRUE, col = "red")
plot(tree_performance, add = TRUE, col = "green")
plot(info_performance, add = TRUE)
abline(a = 0, b = 1, lty = 2)
# Legend order matches the colours: pruned = blue, gini-tuned = red,
# information-tuned = black (default plot colour), unpruned baseline = green.
# FIX: corrected user-facing typos "Tunned" -> "Tuned", "unprunned" -> "Unpruned".
legend("bottomright", legend = c("Pruned - 1st method", "Tuned - 2nd method",
"Tuned - 3rd method", "Unpruned"),
col = c("blue", "red", "black", "green"), lwd = 3, cex = .45, horiz = TRUE)
\(\\\)
\(\\\)
set.seed(100)
# Grid of candidate probability cutoffs to scan
thresholds <- seq(from = 0.001, to = 0.999, by = 0.001)
# FIX: preallocate instead of growing the vector inside the loop
accuracy <- numeric(length(thresholds))
#Using train dataset to check new accuracy driven by new threshold
gini_prob.train <- predict(dtree_fit, newdata = newtrain2,
type = "prob")[, 2]
#Tuned by gini index splitting criterion model
for (i in seq_along(thresholds)) {
  accuracy[i] <- mean((gini_prob.train > thresholds[i]) ==
                        (newtrain2$income == ">50K"))
}
#Threshold which gives maximum accuracy
# FIX: index into the grid directly instead of multiplying the index by the
# step size, so this stays correct if the grid ever changes.
thres1 <- thresholds[which.max(accuracy)]
thres1
## [1] 0.453
#plot of accuracy vs thresholds
threstable <- data.frame(thresholds, accuracy)
ggplot(threstable, aes(x = thresholds, y = accuracy)) + geom_point()
#Get accuracy rate of testset data using the optimal threshold
# FIX: confusionMatrix() requires factor inputs in recent caret releases
confusionMatrix(factor(gini_prob > thres1), factor(newtest2$income == ">50K"))
## Confusion Matrix and Statistics
##
## Reference
## Prediction FALSE TRUE
## FALSE 11918 1747
## TRUE 517 2014
##
## Accuracy : 0.8602
## 95% CI : (0.8548, 0.8655)
## No Information Rate : 0.7678
## P-Value [Acc > NIR] : < 0.00000000000000022
##
## Kappa : 0.5575
## Mcnemar's Test P-Value : < 0.00000000000000022
##
## Sensitivity : 0.9584
## Specificity : 0.5355
## Pos Pred Value : 0.8722
## Neg Pred Value : 0.7957
## Prevalence : 0.7678
## Detection Rate : 0.7359
## Detection Prevalence : 0.8437
## Balanced Accuracy : 0.7470
##
## 'Positive' Class : FALSE
##
#Test accuracy rate by using optimal threshold
prunned.gini.accuracy <- mean((gini_prob > thres1) == (newtest2$income == ">50K"))
#Test accuracy rate by using default threshold(0.5)
prunned.gini.accuracy.half <- mean((gini_prob > 0.5) == (newtest2$income == ">50K"))
#==================================================================
#Using train dataset to check new accuracy driven by new threshold
info_prob.train <- predict(dtree_fit_info, newdata = newtrain2,
type = "prob")[, 2]
#Tuned by information gain (cross-entropy) splitting criterion model
#(comment corrected: the original said "gini", a copy-paste slip)
for(i in 1:length(thresholds)){
accuracy[i] <- mean((info_prob.train > thresholds[i]) ==
(newtrain2$income == ">50K"))
}
#Threshold which give maximum accuracy
thres2 <- which.max(accuracy) * 0.001
thres2
## [1] 0.422
#plot of accuracy vs thresholds
threstable <- data.frame(thresholds, accuracy)
ggplot(threstable, aes(x = thresholds, y = accuracy)) + geom_point()
#Get accuracy rate of testset data using the optimal threshold
confusionMatrix(info_prob > thres2, newtest2$income == ">50K")
## Confusion Matrix and Statistics
##
## Reference
## Prediction FALSE TRUE
## FALSE 11767 1585
## TRUE 668 2176
##
## Accuracy : 0.8609
## 95% CI : (0.8555, 0.8662)
## No Information Rate : 0.7678
## P-Value [Acc > NIR] : < 0.00000000000000022
##
## Kappa : 0.5736
## Mcnemar's Test P-Value : < 0.00000000000000022
##
## Sensitivity : 0.9463
## Specificity : 0.5786
## Pos Pred Value : 0.8813
## Neg Pred Value : 0.7651
## Prevalence : 0.7678
## Detection Rate : 0.7265
## Detection Prevalence : 0.8244
## Balanced Accuracy : 0.7624
##
## 'Positive' Class : FALSE
##
#Test accuracy rate by using optimal threshold
prunned.info.accuracy <- mean((info_prob > thres2) == (newtest2$income == ">50K"))
#Test accuracy rate by using default threshold(0.5)
prunned.info.accuracy.half <- mean((info_prob > 0.5) == (newtest2$income == ">50K"))
#==================================================================
#Using train dataset to check new accuracy driven by new threshold
tree_prob.train <- predict(tree, newdata = newtrain2, type = "prob")[, 2]
#Unpruned tree model
#(comment corrected: the original said "Tuned by gini index", a copy-paste slip)
for(i in 1:length(thresholds)){
accuracy[i] <- mean((tree_prob.train > thresholds[i]) ==
(newtrain2$income == ">50K"))
}
#Threshold which give maximum accuracy
thres3 <- which.max(accuracy) * 0.001
thres3
## [1] 0.367
#plot of accuracy vs thresholds
threstable <- data.frame(thresholds, accuracy)
ggplot(threstable, aes(x = thresholds, y = accuracy)) + geom_point()
#Get accuracy rate of testset data using the optimal threshold
confusionMatrix(tree_prob > thres3, newtest2$income == ">50K")
## Confusion Matrix and Statistics
##
## Reference
## Prediction FALSE TRUE
## FALSE 11859 1825
## TRUE 576 1936
##
## Accuracy : 0.8518
## 95% CI : (0.8462, 0.8572)
## No Information Rate : 0.7678
## P-Value [Acc > NIR] : < 0.00000000000000022
##
## Kappa : 0.5298
## Mcnemar's Test P-Value : < 0.00000000000000022
##
## Sensitivity : 0.9537
## Specificity : 0.5148
## Pos Pred Value : 0.8666
## Neg Pred Value : 0.7707
## Prevalence : 0.7678
## Detection Rate : 0.7322
## Detection Prevalence : 0.8449
## Balanced Accuracy : 0.7342
##
## 'Positive' Class : FALSE
##
#Test accuracy rate by using optimal threshold
unprunned.accuracy <- mean((tree_prob > thres3) == (newtest2$income == ">50K"))
#Test accuracy rate by using default threshold(0.5)
#(comment corrected: the original repeated "optimal threshold")
unprunned.accuracy.half <- mean((tree_prob > 0.5) == (newtest2$income == ">50K"))
#==================================================================
#Using train dataset to check new accuracy driven by new threshold
pruned_prob.train <- predict(treepruned, newdata = newtrain2,
type = "prob")[, 2]
#Pruned tree model
#(comment corrected: the original said "Tuned by gini index", a copy-paste slip)
for(i in 1:length(thresholds)){
accuracy[i] <- mean((pruned_prob.train > thresholds[i]) ==
(newtrain2$income == ">50K"))
}
#Threshold which give maximum accuracy
thres4 <- which.max(accuracy) * 0.001
thres4
## [1] 0.367
#plot of accuracy vs thresholds
threstable <- data.frame(thresholds, accuracy)
ggplot(threstable, aes(x = thresholds, y = accuracy)) + geom_point()
#Get accuracy rate of testset data using the optimal threshold
confusionMatrix(pruned_prob > thres4, newtest2$income == ">50K")
## Confusion Matrix and Statistics
##
## Reference
## Prediction FALSE TRUE
## FALSE 11872 1837
## TRUE 563 1924
##
## Accuracy : 0.8518
## 95% CI : (0.8462, 0.8573)
## No Information Rate : 0.7678
## P-Value [Acc > NIR] : < 0.00000000000000022
##
## Kappa : 0.5288
## Mcnemar's Test P-Value : < 0.00000000000000022
##
## Sensitivity : 0.9547
## Specificity : 0.5116
## Pos Pred Value : 0.8660
## Neg Pred Value : 0.7736
## Prevalence : 0.7678
## Detection Rate : 0.7330
## Detection Prevalence : 0.8464
## Balanced Accuracy : 0.7331
##
## 'Positive' Class : FALSE
##
#Test accuracy rate by using optimal threshold
prunned.accuracy <- mean((pruned_prob > thres4) == (newtest2$income == ">50K"))
#Test accuracy rate by using default threshold(0.5)
prunned.accuracy.half <- mean((pruned_prob > 0.5) == (newtest2$income == ">50K"))
set.seed(100)
#Compare AUC
#auc[, order(auc)] reorders the one-row data frame's columns ascending,
#so the right-most column is the best model by AUC
auc <- data.frame(pruned.auc, info.auc, gini.auc, tree.auc)
auc[, order(auc)]
## pruned.auc gini.auc info.auc tree.auc
## 1 0.858427 0.8718559 0.872279 0.8768653
#Pick the model with the largest AUC - unprunned tree
final.auc1 <- tree
#Compare Accuracy - optimal threshold
accuracy.tree.df <- data.frame(unprunned.accuracy, prunned.accuracy,
prunned.gini.accuracy, prunned.info.accuracy)
accuracy.tree.df[, order(accuracy.tree.df)]
## unprunned.accuracy prunned.accuracy prunned.gini.accuracy
## 1 0.8517535 0.8518153 0.8602124
## prunned.info.accuracy
## 1 0.8608916
#Pick the model with the highest Accuracy - prunned.info.accuracy
final.thres1 <- dtree_fit_info
#Compare Accuracy - default threshold (0.5)
accuracy.tree.df.half <- data.frame(unprunned.accuracy.half,
prunned.accuracy.half,
prunned.gini.accuracy.half,
prunned.info.accuracy.half)
accuracy.tree.df.half[, order(accuracy.tree.df.half)]
## unprunned.accuracy.half prunned.accuracy.half prunned.gini.accuracy.half
## 1 0.8517535 0.8518153 0.8602124
## prunned.info.accuracy.half
## 1 0.8608916
#Pick the model with the highest Accuracy - prunned.info.accuracy
final.thres1.half <- dtree_fit_info
Comment:
We found that when we apply the threshold that gives the maximum accuracy rate on the training dataset to predicting the test dataset, we eventually end up with the same accuracy rate that we got from the test dataset.
We picked the unpruned tree model based on the AUC model selection, and the model tuned by information gain (cross-entropy) based on the accuracy rate.
set.seed(100)
#=============================================================
#Create a task
#mlr classification tasks wrapping the one-hot-encoded train/test frames;
#">50K" is declared the positive class for tpr/fpr measures below
traintask <- makeClassifTask(data = newtrain2, target = "income", positive = ">50K")
testtask <- makeClassifTask(data = newtest2, target = "income", positive = ">50K")
#Brief view of trainTask
traintask
## Supervised task: newtrain2
## Type: classif
## Target: income
## Observations: 32402
## Features:
## numerics factors ordered
## 43 0 0
## Missings: FALSE
## Has weights: FALSE
## Has blocking: FALSE
## Classes: 2
## <=50K >50K
## 24720 7682
## Positive class: >50K
#For deeper View
str(getTaskData(traintask))
## 'data.frame': 32402 obs. of 44 variables:
## $ age : int 39 50 38 53 28 37 49 52 31 42 ...
## $ education.num : int 13 13 9 7 13 14 5 9 14 13 ...
## $ capital.gain : int 2174 0 0 0 0 0 0 0 14084 5178 ...
## $ capital.loss : int 0 0 0 0 0 0 0 0 0 0 ...
## $ hours.per.week : int 40 13 40 40 40 40 16 45 50 40 ...
## $ Local.gov : int 0 0 0 0 0 0 0 0 0 0 ...
## $ No.gain : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Private : int 0 0 1 1 1 1 1 0 1 1 ...
## $ Self.emp.inc : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Self.emp.not.inc : int 0 1 0 0 0 0 0 1 0 0 ...
## $ State.gov : int 1 0 0 0 0 0 0 0 0 0 ...
## $ Married.AF.spouse : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Married.civ.spouse : int 0 1 0 1 1 1 0 1 0 1 ...
## $ Married.spouse.absent: int 0 0 0 0 0 0 1 0 0 0 ...
## $ Never.married : int 1 0 0 0 0 0 0 0 1 0 ...
## $ Separated : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Widowed : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Armed.Forces : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Craft.repair : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Exec.managerial : int 0 1 0 0 0 1 0 1 0 1 ...
## $ Farming.fishing : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Handlers.cleaners : int 0 0 1 1 0 0 0 0 0 0 ...
## $ Machine.op.inspct : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Other.service : int 0 0 0 0 0 0 1 0 0 0 ...
## $ Priv.house.serv : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Prof.specialty : int 0 0 0 0 1 0 0 0 1 0 ...
## $ Protective.serv : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Sales : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Tech.support : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Transport.moving : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Not.in.family : int 1 0 1 0 0 0 1 0 1 0 ...
## $ Other.relative : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Own.child : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Unmarried : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Wife : int 0 0 0 0 1 1 0 0 0 0 ...
## $ Asian.Pac.Islander : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Black : int 0 0 0 1 1 0 1 0 0 0 ...
## $ Other : int 0 0 0 0 0 0 0 0 0 0 ...
## $ White : int 1 1 1 0 0 1 0 1 1 1 ...
## $ Male : int 1 1 1 1 0 0 0 1 0 1 ...
## $ other_countries : int 0 0 0 0 1 0 1 0 0 0 ...
## $ Philippines : int 0 0 0 0 0 0 0 0 0 0 ...
## $ United.States : int 1 1 1 1 0 1 0 1 1 1 ...
## $ income : Factor w/ 2 levels "<=50K",">50K": 1 1 1 1 1 1 1 2 2 2 ...
#Create a bagging learner
#Base learner: an rpart classification tree splitting on the gini index
bagged <- makeLearner("classif.rpart", parms = list(split = "gini"),
predict.type = "response")
#Set up the bagging algorithm which will grow 100 trees on randomized samples of data with replacement.
bag <- makeBaggingWrapper(learner = bagged, bw.iters = 100, bw.replace = TRUE)
# Q : bw.iters [integer(1)] Iterations = number of fitted models in bagging. Default is 10
#To check the performance, set up a validation strategy
#set 3 fold cross validation
rdesc <- makeResampleDesc("CV", iters = 3L)
# With 100 trees, bagging has returned an accuracy of 84.5%
# show.info = TRUE spelled out (was the reassignable shorthand T)
r <- resample(learner = bag , task = traintask, resampling = rdesc,
measures = list(tpr, fpr, fnr, tnr, acc), show.info = TRUE)
## [Resample] cross-validation iter 1: tpr.test.mean=0.522,fpr.test.mean=0.0512,fnr.test.mean=0.478,tnr.test.mean=0.949,acc.test.mean=0.846
## [Resample] cross-validation iter 2: tpr.test.mean=0.528,fpr.test.mean=0.0549,fnr.test.mean=0.472,tnr.test.mean=0.945,acc.test.mean=0.849
## [Resample] cross-validation iter 3: tpr.test.mean=0.494,fpr.test.mean=0.0526,fnr.test.mean=0.506,tnr.test.mean=0.947,acc.test.mean=0.839
## [Resample] Aggr. Result: tpr.test.mean=0.515,fpr.test.mean=0.0529,fnr.test.mean=0.485,tnr.test.mean=0.947,acc.test.mean=0.845
#Show true positive rate, false positive rate, false negative rate, true negative rate, and accuracy rate from bagged model
r
## Resample Result
## Task: newtrain2
## Learner: classif.rpart.bagged
## Aggr perf: tpr.test.mean=0.515,fpr.test.mean=0.0529,fnr.test.mean=0.485,tnr.test.mean=0.947,acc.test.mean=0.845
## Runtime: 117.164
#Aggr. Result: tpr.test.mean=0.514,fpr.test.mean=0.0554,fnr.test.mean=0.486,tnr.test.mean=0.945,acc.test.mean=0.843
#=============================================================
#Make a random bagged learner (mtry = number of variables in dataset)
#With mtry = 43 (all 43 predictors considered at every split) randomForest
#degenerates into plain bagging, enabling a like-for-like comparison
bag.rf <- makeLearner("classif.randomForest", predict.type = "response",
par.vals = list(ntree = 50L, mtry = 43,
importance = TRUE))
r2 <- resample(learner = bag.rf, task = traintask, resampling = rdesc,
measures = list(tpr,fpr,fnr,tnr,acc), show.info = TRUE)
## [Resample] cross-validation iter 1: tpr.test.mean=0.626,fpr.test.mean=0.0816,fnr.test.mean=0.374,tnr.test.mean=0.918,acc.test.mean=0.849
## [Resample] cross-validation iter 2: tpr.test.mean=0.613,fpr.test.mean=0.088,fnr.test.mean=0.387,tnr.test.mean=0.912,acc.test.mean=0.84
## [Resample] cross-validation iter 3: tpr.test.mean=0.628,fpr.test.mean=0.0824,fnr.test.mean=0.372,tnr.test.mean=0.918,acc.test.mean=0.85
## [Resample] Aggr. Result: tpr.test.mean=0.622,fpr.test.mean=0.084,fnr.test.mean=0.378,tnr.test.mean=0.916,acc.test.mean=0.846
#Show true positive rate, false positive rate, false negative rate, true negative rate, and accuracy rate from random forest model
r2
## Resample Result
## Task: newtrain2
## Learner: classif.randomForest
## Aggr perf: tpr.test.mean=0.622,fpr.test.mean=0.084,fnr.test.mean=0.378,tnr.test.mean=0.916,acc.test.mean=0.846
## Runtime: 64.1211
#Aggr perf: tpr.test.mean=0.636,fpr.test.mean=0.0883,fnr.test.mean=0.364,tnr.test.mean=0.912,acc.test.mean=0.846
#Internally, random forest uses a cutoff of 0.5 -->
#if a particular unseen observation has a probability higher than 0.5, it will be classified as >50K.
#In random forest, we have the option to customize the internal cutoff. As the false negative rate is very high now, we'll increase the cutoff for negative classes (<=50K) and accordingly reduce it for positive classes (>50K). Then, train the model again.
#Evaluating by using new cutoff
# NOTE(review): this mutates bag.rf$par.vals in place, so every later use of
# bag.rf (e.g. the "untunned" fits below) silently inherits this cutoff too.
bag.rf$par.vals <- list(ntree = 50L, mtry = 43, importance = TRUE, cutoff = c(0.55, 0.45))
r3 <- resample(learner = bag.rf, task = traintask, resampling = rdesc,
measures = list(tpr,fpr,fnr,tnr,acc), show.info = TRUE)
## [Resample] cross-validation iter 1: tpr.test.mean=0.647,fpr.test.mean=0.103,fnr.test.mean=0.353,tnr.test.mean=0.897,acc.test.mean=0.84
## [Resample] cross-validation iter 2: tpr.test.mean=0.69,fpr.test.mean=0.102,fnr.test.mean=0.31,tnr.test.mean=0.898,acc.test.mean=0.848
## [Resample] cross-validation iter 3: tpr.test.mean=0.659,fpr.test.mean=0.102,fnr.test.mean=0.341,tnr.test.mean=0.898,acc.test.mean=0.84
## [Resample] Aggr. Result: tpr.test.mean=0.665,fpr.test.mean=0.103,fnr.test.mean=0.335,tnr.test.mean=0.897,acc.test.mean=0.842
#Show true positive rate, false positive rate, false negative rate, true negative rate, and accuracy rate from random forest model
r3
## Resample Result
## Task: newtrain2
## Learner: classif.randomForest
## Aggr perf: tpr.test.mean=0.665,fpr.test.mean=0.103,fnr.test.mean=0.335,tnr.test.mean=0.897,acc.test.mean=0.842
## Runtime: 59.9893
#Aggr perf: tpr.test.mean=0.636,fpr.test.mean=0.0646,fnr.test.mean=0.364,tnr.test.mean=0.935,acc.test.mean=0.864 ---> we can see that false negative rate is decreased even though the accuracy rate stays the same. I have tried cutoff = c(0.6, 0.4), cutoff = c(0.7, 0.3) but they all gave lower accuracy late.
#========================================================================
#Let's see how the test classification error changes as we increase the number of trees for untunned model ( #number of trees VS test classification error)
#Train a old untunned model
# NOTE(review): bag.rf$par.vals was overwritten above with
# cutoff = c(0.55, 0.45), so this "untunned" fit inherits that cutoff —
# confirm that is intended.
untunnedbagged <- mlr::train(bag.rf, traintask)
#Per-tree class votes for every training row (n x ntree character matrix)
bag.untunned_ind <- predict(untunnedbagged$learner.model, newtrain2,
predict.all = TRUE)$individual
head(bag.untunned_ind, 2)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]
## 1 "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
## 2 "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
## [,10] [,11] [,12] [,13] [,14] [,15] [,16] [,17] [,18]
## 1 "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
## 2 "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
## [,19] [,20] [,21] [,22] [,23] [,24] [,25] [,26] [,27]
## 1 "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K"
## 2 "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
## [,28] [,29] [,30] [,31] [,32] [,33] [,34] [,35] [,36]
## 1 "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
## 2 "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
## [,37] [,38] [,39] [,40] [,41] [,42] [,43] [,44] [,45]
## 1 "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K"
## 2 "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K"
## [,46] [,47] [,48] [,49] [,50]
## 1 "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
## 2 "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
n <- dim(bag.untunned_ind)[1]
m <- ceiling(dim(bag.untunned_ind)[2] / 2)
#Preallocate result vectors instead of growing them inside the loop
predicted_ind <- character(n)
misclass.ind <- numeric(m)
for(i in 1:m){ # forest sizes evaluated: 1, 3, ..., 2m-1 trees
for(j in 1:n){
# Majority vote over the first 2*i - 1 trees; ties resolve to the first
# class name returned by table().
# BUGFIX: the original subscript `1:i*2-1` parses as `(1:i)*2 - 1`
# (operator precedence: `:` binds tighter than `*`), i.e. only the i
# odd-indexed trees — inconsistent with the ntree = seq(1, 50, 2)
# labels plotted below. The recorded plot predates this fix.
predicted_ind[j] <- names(which.max(table(bag.untunned_ind[j, seq_len(2 * i - 1)])))
}
#Training misclassification rate of the i-th sub-forest
misclass.ind[i] <- mean(predicted_ind != newtrain2$income)
}
bag.untunned.df <- data.frame(misclass.ind, ntree = seq(1, 50, 2))
ggplot(bag.untunned.df, aes(x = ntree, y = misclass.ind)) + geom_line() +
ggtitle("Number of trees vs Misclassification rate in training dataset - untunned bagged model")
#======================== Let's actually tune the hyperparameters
#Bagged tree tuning
#List every tunable hyperparameter mlr exposes for classif.randomForest
getParamSet(bag.rf)
## Type len Def Constr Req Tunable Trafo
## ntree integer - 500 1 to Inf - TRUE -
## mtry integer - - 1 to Inf - TRUE -
## replace logical - TRUE - - TRUE -
## classwt numericvector <NA> - 0 to Inf - TRUE -
## cutoff numericvector <NA> - 0 to 1 - TRUE -
## strata untyped - - - - FALSE -
## sampsize integervector <NA> - 1 to Inf - TRUE -
## nodesize integer - 1 1 to Inf - TRUE -
## maxnodes integer - - 1 to Inf - TRUE -
## importance logical - FALSE - - TRUE -
## localImp logical - FALSE - - TRUE -
## proximity logical - FALSE - - FALSE -
## oob.prox logical - - - Y FALSE -
## norm.votes logical - TRUE - - FALSE -
## do.trace logical - FALSE - - FALSE -
## keep.forest logical - TRUE - - FALSE -
## keep.inbag logical - FALSE - - FALSE -
#Specifying the search space for hyperparameters
bag.rf_params <- makeParamSet(makeIntegerParam("nodesize",
lower = 10, upper = 50),
makeIntegerParam("ntree", lower = 3, upper = 100))
#Set validation strategy
rdesc <- makeResampleDesc("CV", iters = 3L)
#Set optimization technique: random search with only 5 candidate configs
bag.rf_ctrl <- makeTuneControlRandom(maxit = 5L)
#Start Hypertuning the parameters (maximizing CV accuracy)
bag.rf_tune <- tuneParams(learner = bag.rf, task = traintask,
resampling = rdesc,
measures = list(acc), par.set = bag.rf_params,
control = bag.rf_ctrl, show.info = TRUE)
## [Tune] Started tuning learner classif.randomForest for parameter set:
## Type len Def Constr Req Tunable Trafo
## nodesize integer - - 10 to 50 - TRUE -
## ntree integer - - 3 to 100 - TRUE -
## With control class: TuneControlRandom
## Imputation value: -0
## [Tune-x] 1: nodesize=21; ntree=24
## [Tune-y] 1: acc.test.mean=0.857; time: 0.5 min
## [Tune-x] 2: nodesize=17; ntree=51
## [Tune-y] 2: acc.test.mean=0.856; time: 1.0 min
## [Tune-x] 3: nodesize=17; ntree=66
## [Tune-y] 3: acc.test.mean=0.858; time: 1.3 min
## [Tune-x] 4: nodesize=43; ntree=68
## [Tune-y] 4: acc.test.mean=0.863; time: 1.2 min
## [Tune-x] 5: nodesize=29; ntree=52
## [Tune-y] 5: acc.test.mean=0.861; time: 0.9 min
## [Tune] Result: nodesize=43; ntree=68 : acc.test.mean=0.863
#Optimal hypertuned parameters
bag.rf_tune$x
## $nodesize
## [1] 43
##
## $ntree
## [1] 68
#Accuracy rate from Cross Validation
bag.rf_tune$y
## acc.test.mean
## 0.8625703
#Use hyperparameters for modeling
#setHyperPars merges the tuned nodesize/ntree on top of bag.rf's existing
#par.vals (mtry = 43, importance = TRUE, cutoff = c(0.55, 0.45))
bag.rf_tree <- setHyperPars(bag.rf, par.vals = bag.rf_tune$x)
#Train a model
bag.rforest <- mlr::train(bag.rf_tree, traintask)
getLearnerModel(bag.rforest)
##
## Call:
## randomForest(formula = f, data = data, classwt = classwt, cutoff = cutoff, ntree = 68L, mtry = 43, importance = TRUE, nodesize = 43L)
## Type of random forest: classification
## Number of trees: 68
## No. of variables tried at each split: 43
##
## OOB estimate of error rate: 13.66%
## Confusion matrix:
## <=50K >50K class.error
## <=50K 22960 1760 0.07119741
## >50K 2666 5016 0.34704504
#***Make plots for random forest model
#========================================================================
#Let's see how the test classification error changes as we increase the number of trees for tunned model ( #number of trees VS test classification error)
#Per-tree class votes for every training row (n x 68 character matrix)
bag.tunned_ind <- predict(bag.rforest$learner.model, newtrain2,
predict.all = TRUE)$individual
head(bag.tunned_ind, 2)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]
## 1 "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
## 2 "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
## [,10] [,11] [,12] [,13] [,14] [,15] [,16] [,17] [,18]
## 1 "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
## 2 "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" ">50K"
## [,19] [,20] [,21] [,22] [,23] [,24] [,25] [,26] [,27]
## 1 "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
## 2 "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
## [,28] [,29] [,30] [,31] [,32] [,33] [,34] [,35] [,36]
## 1 "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K"
## 2 "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K"
## [,37] [,38] [,39] [,40] [,41] [,42] [,43] [,44] [,45]
## 1 "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
## 2 "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K"
## [,46] [,47] [,48] [,49] [,50] [,51] [,52] [,53] [,54]
## 1 "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K"
## 2 "<=50K" ">50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K"
## [,55] [,56] [,57] [,58] [,59] [,60] [,61] [,62] [,63]
## 1 "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
## 2 "<=50K" "<=50K" "<=50K" ">50K" "<=50K" ">50K" ">50K" "<=50K" "<=50K"
## [,64] [,65] [,66] [,67] [,68]
## 1 "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
## 2 "<=50K" "<=50K" "<=50K" ">50K" "<=50K"
n <- dim(bag.tunned_ind)[1]
m <- ceiling(dim(bag.tunned_ind)[2] / 2)
#Preallocate result vectors instead of growing them inside the loop
predicted_ind <- character(n)
misclass.ind <- numeric(m)
for(i in 1:m){ # forest sizes evaluated: 1, 3, ..., 2m-1 trees
for(j in 1:n){
# Majority vote over the first 2*i - 1 trees.
# BUGFIX: the original subscript `1:i*2-1` parses as `(1:i)*2 - 1`
# (odd-indexed trees only), inconsistent with the ntree = seq(1, 68, 2)
# labels plotted below. The recorded plot predates this fix.
predicted_ind[j] <- names(which.max(table(bag.tunned_ind[j, seq_len(2 * i - 1)])))
}
#Training misclassification rate of the i-th sub-forest
misclass.ind[i] <- mean(predicted_ind != newtrain2$income)
}
bag.tunned.df <- data.frame(misclass.ind, ntree = seq(1, 68, 2))
ggplot(bag.tunned.df, aes(x = ntree, y = misclass.ind)) + geom_line() +
ggtitle("Number of trees vs Misclassification rate in training dataset - tunned bagged model")
#Variable importance statistics
#Dot-chart of per-variable importance from the fitted randomForest
varImpPlot(bag.rforest$learner.model)
#Numeric importance table (per-class accuracy decrease + mean gini decrease)
importance(bag.rforest$learner.model)
## <=50K >50K MeanDecreaseAccuracy
## age -13.022404619 67.4753667 40.16504758
## education.num 14.906765795 60.3634774 52.75366374
## capital.gain 95.990371633 100.2240681 120.62875611
## capital.loss 29.881885811 68.2701358 50.74005081
## hours.per.week -5.344224745 37.2771623 27.71506590
## Local.gov 7.904425737 -0.9551792 5.76331191
## No.gain 0.000000000 0.0000000 0.00000000
## Private 8.298374351 -1.9894435 7.92407861
## Self.emp.inc 4.409166626 1.2369293 4.21427059
## Self.emp.not.inc 17.765098169 0.7731384 17.40876070
## State.gov 7.599717792 -3.5201521 4.82212848
## Married.AF.spouse -1.899616305 4.0462318 1.74721957
## Married.civ.spouse 24.951055410 74.4942455 85.35353206
## Married.spouse.absent -3.587531903 0.2389463 -3.40144767
## Never.married 3.141743832 -1.8860144 3.03550001
## Separated -0.311222932 0.5310407 -0.06140906
## Widowed 1.024164894 -0.1165945 0.89627638
## Armed.Forces 0.000000000 0.0000000 0.00000000
## Craft.repair 2.512301411 3.4041561 7.09837298
## Exec.managerial 2.503491752 14.2502558 17.78909167
## Farming.fishing 7.289129670 7.6609658 12.85222437
## Handlers.cleaners 1.601077058 10.3739954 10.56789812
## Machine.op.inspct 2.463727194 8.0450045 10.09705795
## Other.service -13.238964972 18.1797187 16.11203961
## Priv.house.serv -1.007435047 0.0000000 -1.00743505
## Prof.specialty 9.325886134 6.3193696 14.03067808
## Protective.serv -1.154071066 7.6527703 6.40478480
## Sales -3.099394496 4.4816547 2.81916174
## Tech.support 3.615391280 15.3277499 15.12281346
## Transport.moving 3.540878339 4.1488966 6.77990757
## Not.in.family 4.560014618 2.4358485 7.83037710
## Other.relative -2.733742632 9.1867657 7.43418019
## Own.child 4.746791011 3.7476768 8.50872528
## Unmarried -0.964675986 -0.7981551 -1.19627697
## Wife 3.439577110 15.5303412 5.61330163
## Asian.Pac.Islander 4.879102564 -0.2367667 4.42374634
## Black 1.388736810 5.7893184 6.44067514
## Other 0.007983821 1.9455020 1.15310924
## White -1.021156086 4.9407330 3.27693944
## Male 9.362764847 1.2051294 11.47695072
## other_countries 3.160924928 0.1052195 2.58749260
## Philippines -2.351414987 2.4893630 0.31596858
## United.States 2.384444995 1.2419303 2.90132520
## MeanDecreaseGini
## age 597.09980328
## education.num 1313.20657000
## capital.gain 1123.93910840
## capital.loss 394.03545718
## hours.per.week 403.69902059
## Local.gov 26.62121430
## No.gain 0.00000000
## Private 34.85345811
## Self.emp.inc 31.42607290
## Self.emp.not.inc 68.42861618
## State.gov 25.36534708
## Married.AF.spouse 5.55656634
## Married.civ.spouse 2290.97129831
## Married.spouse.absent 3.77466788
## Never.married 8.04425050
## Separated 4.45930282
## Widowed 7.00353256
## Armed.Forces 0.00000000
## Craft.repair 27.35518124
## Exec.managerial 95.96340692
## Farming.fishing 36.11000707
## Handlers.cleaners 23.47151631
## Machine.op.inspct 26.91537528
## Other.service 41.44214064
## Priv.house.serv 0.08155885
## Prof.specialty 48.95095629
## Protective.serv 22.82066641
## Sales 35.54876962
## Tech.support 47.94626370
## Transport.moving 31.43454712
## Not.in.family 9.78332366
## Other.relative 8.17089443
## Own.child 7.44808214
## Unmarried 3.26653970
## Wife 42.86895624
## Asian.Pac.Islander 18.52902669
## Black 20.19454254
## Other 6.77053011
## White 24.72990804
## Male 37.61097060
## other_countries 19.37735451
## Philippines 9.10272631
## United.States 16.12389600
\(\\\)
\(\\\)
set.seed(100)
# ** Plot bagged tree
# ** Make predictions on training dataset
bag.rfclass1 <- predict(bag.rforest, traintask)
#Confusion matrix on training dataset (response vs truth, both factors)
confusionMatrix(bag.rfclass1$data$response, bag.rfclass1$data$truth)
## Confusion Matrix and Statistics
##
## Reference
## Prediction <=50K >50K
## <=50K 23414 2203
## >50K 1306 5479
##
## Accuracy : 0.8917
## 95% CI : (0.8883, 0.8951)
## No Information Rate : 0.7629
## P-Value [Acc > NIR] : < 0.00000000000000022
##
## Kappa : 0.6881
## Mcnemar's Test P-Value : < 0.00000000000000022
##
## Sensitivity : 0.9472
## Specificity : 0.7132
## Pos Pred Value : 0.9140
## Neg Pred Value : 0.8075
## Prevalence : 0.7629
## Detection Rate : 0.7226
## Detection Prevalence : 0.7906
## Balanced Accuracy : 0.8302
##
## 'Positive' Class : <=50K
##
#Make random forest plots on training dataset
#plot() of two factors draws a spineplot of predicted vs actual income
plot(bag.rfclass1$data$response, newtrain2$income)
abline(0, 1)
#Training accuracy rate
1 - mean(bag.rfclass1$data$response != newtrain2$income)
## [1] 0.8917042
#Make predictions on test dataset
bag.rfclass2 <- predict(bag.rforest, testtask)
#Confusion matrix on test dataset
confusionMatrix(bag.rfclass2$data$response, bag.rfclass2$data$truth)
## Confusion Matrix and Statistics
##
## Reference
## Prediction <=50K >50K
## <=50K 11504 1340
## >50K 931 2421
##
## Accuracy : 0.8598
## 95% CI : (0.8543, 0.8651)
## No Information Rate : 0.7678
## P-Value [Acc > NIR] : < 0.00000000000000022
##
## Kappa : 0.5913
## Mcnemar's Test P-Value : < 0.00000000000000022
##
## Sensitivity : 0.9251
## Specificity : 0.6437
## Pos Pred Value : 0.8957
## Neg Pred Value : 0.7223
## Prevalence : 0.7678
## Detection Rate : 0.7103
## Detection Prevalence : 0.7930
## Balanced Accuracy : 0.7844
##
## 'Positive' Class : <=50K
##
#Make random forest plots on test dataset
#plot() of two factors draws a spineplot of predicted vs actual income
plot(bag.rfclass2$data$response, newtest2$income)
abline(0, 1)
#Test accuracy rate
1 - mean(bag.rfclass2$data$response != newtest2$income)
## [1] 0.8597802
\(\\\)
\(\\\)
set.seed(100)
#ROC Curve: https://stackoverflow.com/questions/30818188/roc-curve-in-r-using-rpart-package
#Untunned bagged tree model
#Getting predicted >50K of income probabilities
# NOTE(review): bag.rf still carries the cutoff = c(0.55, 0.45) par.vals set
# earlier, so this "untunned" model includes the cutoff change — confirm intent.
untunned.bag.rf <- mlr::train(bag.rf, traintask)
untunned.bag.rf_prob <- predict(untunned.bag.rf$learner.model,
newdata = newtest2, type = "prob")[, 2]
untunned.bag.rf_prediction <- prediction(untunned.bag.rf_prob,
newtest2$income)
untunned.bag.rf_performance <- ROCR::performance(untunned.bag.rf_prediction,
measure = "tpr",
x.measure = "fpr")
#Plot ROC curve
plot(untunned.bag.rf_performance, main = "ROC curve")
abline(a = 0, b = 1, lty = 2)
#Calculate AUC
untunned.bag.rf.auc <- ROCR::performance(untunned.bag.rf_prediction,
measure = "auc")@y.values[[1]]
untunned.bag.rf.auc
## [1] 0.8817957
#=====================================================================
#Tunned bagged tree model
#Getting predicted >50K of income probabilities
tunned.bag.rf_prob <- predict(bag.rforest$learner.model, newdata = newtest2,
type = "prob")[, 2]
tunned.bag.rf_prediction <- prediction(tunned.bag.rf_prob, newtest2$income)
tunned.bag.rf_performance <- ROCR::performance(tunned.bag.rf_prediction,
measure = "tpr",
x.measure = "fpr")
#Plot ROC curve
plot(tunned.bag.rf_performance, main = "ROC curve")
abline(a = 0, b = 1, lty = 2)
#Calculate AUC
tunned.bag.rf.auc <- ROCR::performance(tunned.bag.rf_prediction,
measure = "auc")@y.values[[1]]
tunned.bag.rf.auc
## [1] 0.8942506
\(\\\)
\(\\\)
set.seed(100)
#Compare ROC curve: blue = tuned bagged model, red = untuned bagged model
plot(tunned.bag.rf_performance, main = "ROC curve", col = "blue")
plot(untunned.bag.rf_performance, add = TRUE, col = "red")
abline(a = 0, b = 1, lty = 2)
legend("bottomright", legend = c("Tunned", "Untunned"), col = c("blue", "red"), lwd=3, cex=.8, horiz = TRUE)
#Compare AUC (columns reordered ascending; right-most = best)
auc <- data.frame(tunned.bag.rf.auc, untunned.bag.rf.auc)
auc[, order(auc)]
## untunned.bag.rf.auc tunned.bag.rf.auc
## 1 0.8817957 0.8942506
#Pick the model with the largest AUC --> tunned bagged tree
final.auc2 <- bag.rforest$learner.model
\(\\\)
\(\\\)
set.seed(100)
#Candidate classification thresholds: 0.001, 0.002, ..., 0.999
thresholds <- seq(from = 0.001, 0.999, 0.001)
accuracy <- c()
#==================================================================
#Using train dataset to check new accuracy driven by new threshold
untunned.bag.rf_prob.train <- predict(untunned.bag.rf$learner.model,
newdata = newtrain2, type = "prob")[, 2]
#Untuned bagged random forest model
#(comment corrected: the original said "Tuned by gini index", a copy-paste slip)
for(i in 1:length(thresholds)){
accuracy[i] <- mean((untunned.bag.rf_prob.train > thresholds[i]) ==
(newtrain2$income == ">50K"))
}
#Threshold which give maximum accuracy
thres1 <- which.max(accuracy) * 0.001
thres1
## [1] 0.46
#plot of accuracy vs thresholds
threstable <- data.frame(thresholds, accuracy)
ggplot(threstable, aes(x = thresholds, y = accuracy)) + geom_point()
#Get confusion matrix of testset data using the optimal threshold
confusionMatrix(untunned.bag.rf_prob > thres1, newtest2$income == ">50K")
## Confusion Matrix and Statistics
##
## Reference
## Prediction FALSE TRUE
## FALSE 11235 1392
## TRUE 1200 2369
##
## Accuracy : 0.84
## 95% CI : (0.8342, 0.8456)
## No Information Rate : 0.7678
## P-Value [Acc > NIR] : < 0.00000000000000022
##
## Kappa : 0.5431
## Mcnemar's Test P-Value : 0.0001757
##
## Sensitivity : 0.9035
## Specificity : 0.6299
## Pos Pred Value : 0.8898
## Neg Pred Value : 0.6638
## Prevalence : 0.7678
## Detection Rate : 0.6937
## Detection Prevalence : 0.7796
## Balanced Accuracy : 0.7667
##
## 'Positive' Class : FALSE
##
#Test accuracy rate by using optimal threshold
untunned.bagged.accuracy <- mean((untunned.bag.rf_prob > thres1) == (newtest2$income == ">50K"))
#compare the test accuracy by using default threshold (0.5)
thres.untunned.bag.half <- mean((untunned.bag.rf_prob > 0.5) == (newtest2$income == ">50K"))
#==================================================================
#Using train dataset to check new accuracy driven by new threshold
# Predicted P(income > 50K) on the training set from the tuned bagged model
tunned.bag.rf_prob.train <- predict(bag.rforest$learner.model,
newdata = newtrain2, type = "prob")[, 2]
#Accuracy of the tuned bagged model at each candidate cutoff
for(i in seq_along(thresholds)){
accuracy[i] <- mean((tunned.bag.rf_prob.train > thresholds[i]) ==
(newtrain2$income == ">50K"))
}
#Threshold which give maximum accuracy
# Look the best cutoff up in the grid itself instead of multiplying the index
# by a hard-coded step size
thres2 <- thresholds[which.max(accuracy)]
thres2
## [1] 0.442
#plot of accuracy vs thresholds
threstable <- data.frame(thresholds, accuracy)
ggplot(threstable, aes(x = thresholds, y = accuracy)) + geom_point()
#Get confusion matrix of testset data using the optimal threshold
# Evaluated on the held-out test set; thres2 was chosen on the training set
confusionMatrix(tunned.bag.rf_prob > thres2, newtest2$income == ">50K")
## Confusion Matrix and Statistics
##
## Reference
## Prediction FALSE TRUE
## FALSE 11504 1340
## TRUE 931 2421
##
## Accuracy : 0.8598
## 95% CI : (0.8543, 0.8651)
## No Information Rate : 0.7678
## P-Value [Acc > NIR] : < 0.00000000000000022
##
## Kappa : 0.5913
## Mcnemar's Test P-Value : < 0.00000000000000022
##
## Sensitivity : 0.9251
## Specificity : 0.6437
## Pos Pred Value : 0.8957
## Neg Pred Value : 0.7223
## Prevalence : 0.7678
## Detection Rate : 0.7103
## Detection Prevalence : 0.7930
## Balanced Accuracy : 0.7844
##
## 'Positive' Class : FALSE
##
#Test accuracy rate by using optimal threshold
# Fraction of test rows where (prob > cutoff) agrees with the true >50K label
tunned.bagged.accuracy <- mean((tunned.bag.rf_prob > thres2) == (newtest2$income == ">50K"))
#compare the test accuracy by using default threshold (0.5)
thres.tunned.bag.half <- mean((tunned.bag.rf_prob > 0.5) == (newtest2$income == ">50K"))
set.seed(100)
#Compare AUC
# order() on a one-row data frame sorts the columns ascending, so the
# better model's value prints last
auc <- data.frame(tunned.bag.rf.auc, untunned.bag.rf.auc)
auc[, order(auc)]
## untunned.bag.rf.auc tunned.bag.rf.auc
## 1 0.8817957 0.8942506
#Pick the model with the largest AUC --> tunned bagged tree
final.auc2 <- bag.rforest$learner.model
#Compare Accuracy - optimal threshold
accuracy.bag.df <- data.frame(tunned.bagged.accuracy,
untunned.bagged.accuracy)
accuracy.bag.df[, order(accuracy.bag.df)]
## untunned.bagged.accuracy tunned.bagged.accuracy
## 1 0.8399605 0.8597802
#Pick the model with the highest Accuracy --> tuned bagged tree
final.thres2 <- bag.rforest$learner.model
#Compare Accuracy - 0.5 threshold
accuracy.bag.df.half <- data.frame(thres.untunned.bag.half,
thres.tunned.bag.half)
accuracy.bag.df.half[, order(accuracy.bag.df.half)]
## thres.untunned.bag.half thres.tunned.bag.half
## 1 0.8449617 0.8620647
#Pick the model with the highest Accuracy --> tuned bagged tree
final.thres2.half <- bag.rforest$learner.model
set.seed(100)
#=============================================================
#Create a task
# mlr classification tasks wrapping the train/test data frames;
# the positive class is explicitly set to ">50K"
traintask <- makeClassifTask(data = newtrain2, target = "income", positive = ">50K")
testtask <- makeClassifTask(data = newtest2, target = "income", positive = ">50K")
#Brief view of trainTask
traintask
## Supervised task: newtrain2
## Type: classif
## Target: income
## Observations: 32402
## Features:
## numerics factors ordered
## 43 0 0
## Missings: FALSE
## Has weights: FALSE
## Has blocking: FALSE
## Classes: 2
## <=50K >50K
## 24720 7682
## Positive class: >50K
#For deeper View
# Structure of the task data: 43 numeric (dummy-coded) features plus the
# 2-level income factor target (per the recorded output)
str(getTaskData(traintask))
## 'data.frame': 32402 obs. of 44 variables:
## $ age : int 39 50 38 53 28 37 49 52 31 42 ...
## $ education.num : int 13 13 9 7 13 14 5 9 14 13 ...
## $ capital.gain : int 2174 0 0 0 0 0 0 0 14084 5178 ...
## $ capital.loss : int 0 0 0 0 0 0 0 0 0 0 ...
## $ hours.per.week : int 40 13 40 40 40 40 16 45 50 40 ...
## $ Local.gov : int 0 0 0 0 0 0 0 0 0 0 ...
## $ No.gain : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Private : int 0 0 1 1 1 1 1 0 1 1 ...
## $ Self.emp.inc : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Self.emp.not.inc : int 0 1 0 0 0 0 0 1 0 0 ...
## $ State.gov : int 1 0 0 0 0 0 0 0 0 0 ...
## $ Married.AF.spouse : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Married.civ.spouse : int 0 1 0 1 1 1 0 1 0 1 ...
## $ Married.spouse.absent: int 0 0 0 0 0 0 1 0 0 0 ...
## $ Never.married : int 1 0 0 0 0 0 0 0 1 0 ...
## $ Separated : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Widowed : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Armed.Forces : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Craft.repair : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Exec.managerial : int 0 1 0 0 0 1 0 1 0 1 ...
## $ Farming.fishing : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Handlers.cleaners : int 0 0 1 1 0 0 0 0 0 0 ...
## $ Machine.op.inspct : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Other.service : int 0 0 0 0 0 0 1 0 0 0 ...
## $ Priv.house.serv : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Prof.specialty : int 0 0 0 0 1 0 0 0 1 0 ...
## $ Protective.serv : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Sales : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Tech.support : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Transport.moving : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Not.in.family : int 1 0 1 0 0 0 1 0 1 0 ...
## $ Other.relative : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Own.child : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Unmarried : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Wife : int 0 0 0 0 1 1 0 0 0 0 ...
## $ Asian.Pac.Islander : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Black : int 0 0 0 1 1 0 1 0 0 0 ...
## $ Other : int 0 0 0 0 0 0 0 0 0 0 ...
## $ White : int 1 1 1 0 0 1 0 1 1 1 ...
## $ Male : int 1 1 1 1 0 0 0 1 0 1 ...
## $ other_countries : int 0 0 0 0 1 0 1 0 0 0 ...
## $ Philippines : int 0 0 0 0 0 0 0 0 0 0 ...
## $ United.States : int 1 1 1 1 0 1 0 1 1 1 ...
## $ income : Factor w/ 2 levels "<=50K",">50K": 1 1 1 1 1 1 1 2 2 2 ...
#=============================================================
#Make a random forest learner
# 50 trees, importance recorded; predict.type = "response" returns class labels
rf <- makeLearner("classif.randomForest", predict.type = "response",
par.vals = list(ntree = 50L, importance = TRUE))
#To check the performance, set up a validation strategy
#set 3 fold cross validation
rdesc <- makeResampleDesc("CV", iters = 3L)
r2 <- resample(learner = rf, task = traintask, resampling = rdesc,
measures = list(tpr,fpr,fnr,tnr,acc), show.info = TRUE)
## [Resample] cross-validation iter 1: tpr.test.mean=0.632,fpr.test.mean=0.0573,fnr.test.mean=0.368,tnr.test.mean=0.943,acc.test.mean=0.868
## [Resample] cross-validation iter 2: tpr.test.mean=0.635,fpr.test.mean=0.0603,fnr.test.mean=0.365,tnr.test.mean=0.94,acc.test.mean=0.869
## [Resample] cross-validation iter 3: tpr.test.mean=0.602,fpr.test.mean=0.0581,fnr.test.mean=0.398,tnr.test.mean=0.942,acc.test.mean=0.861
## [Resample] Aggr. Result: tpr.test.mean=0.623,fpr.test.mean=0.0586,fnr.test.mean=0.377,tnr.test.mean=0.941,acc.test.mean=0.866
#Show true positive rate, false positive rate, false negative rate, true negative rate, and accuracy rate from random forest model
r2
## Resample Result
## Task: newtrain2
## Learner: classif.randomForest
## Aggr perf: tpr.test.mean=0.623,fpr.test.mean=0.0586,fnr.test.mean=0.377,tnr.test.mean=0.941,acc.test.mean=0.866
## Runtime: 49.0401
#Aggr. Result: tpr.test.mean=0.623,fpr.test.mean=0.0598,fnr.test.mean=0.377,tnr.test.mean=0.94,acc.test.mean=0.865
#Internally, random forest uses a cutoff of 0.5 -->
#if a particular unseen observation has a probability higher than 0.5, it will be classified as >50K.
#In random forest, we have the option to customize the internal cutoff. As the false negative rate is very high now, we'll increase the cutoff for negative classes (<=50K) and accordingly reduce it for positive classes (>50K). Then, train the model again.
#Evaluating by using new cutoff
# cutoff = c(vote share needed for <=50K, for >50K); shifting to c(0.53, 0.47)
# makes it easier for a case to be labelled >50K
rf$par.vals <- list(ntree = 50L, importance = TRUE, cutoff = c(0.53, 0.47))
r3 <- resample(learner = rf, task = traintask, resampling = rdesc,
measures = list(tpr,fpr,fnr,tnr,acc), show.info = TRUE)
## [Resample] cross-validation iter 1: tpr.test.mean=0.624,fpr.test.mean=0.0647,fnr.test.mean=0.376,tnr.test.mean=0.935,acc.test.mean=0.862
## [Resample] cross-validation iter 2: tpr.test.mean=0.651,fpr.test.mean=0.0644,fnr.test.mean=0.349,tnr.test.mean=0.936,acc.test.mean=0.868
## [Resample] cross-validation iter 3: tpr.test.mean=0.66,fpr.test.mean=0.068,fnr.test.mean=0.34,tnr.test.mean=0.932,acc.test.mean=0.867
## [Resample] Aggr. Result: tpr.test.mean=0.645,fpr.test.mean=0.0657,fnr.test.mean=0.355,tnr.test.mean=0.934,acc.test.mean=0.866
#Show true positive rate, false positive rate, false negative rate, true negative rate, and accuracy rate from random forest model
# Resample result under the shifted c(0.53, 0.47) cutoff
r3
## Resample Result
## Task: newtrain2
## Learner: classif.randomForest
## Aggr perf: tpr.test.mean=0.645,fpr.test.mean=0.0657,fnr.test.mean=0.355,tnr.test.mean=0.934,acc.test.mean=0.866
## Runtime: 47.6331
#Aggr. Result: tpr.test.mean=0.651,fpr.test.mean=0.0683,fnr.test.mean=0.349,tnr.test.mean=0.932,acc.test.mean=0.865 ---> we can see that false negative rate is decreased even though the accuracy rate stays the same. I have tried cutoff = c(0.6, 0.4), cutoff = c(0.7, 0.3) but they all gave lower accuracy late.
#========================================================================
#Random Forest tuning
#Train an (untuned) baseline random forest model
untunnedforest <- mlr::train(rf, traintask)
#Let's see how the test classification error changes as we increase the number of trees for untunned model ( #number of trees VS test classification error)
# predict.all = TRUE returns every individual tree's vote in $individual.
# (Spelled out TRUE instead of T, which is a reassignable variable in R.)
rf.untunned_ind <- predict(untunnedforest$learner.model, newtrain2,
predict.all = TRUE)$individual
head(rf.untunned_ind, 2)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]
## 1 "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
## 2 ">50K" "<=50K" ">50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K"
## [,10] [,11] [,12] [,13] [,14] [,15] [,16] [,17] [,18]
## 1 "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
## 2 ">50K" "<=50K" ">50K" "<=50K" ">50K" "<=50K" "<=50K" ">50K" ">50K"
## [,19] [,20] [,21] [,22] [,23] [,24] [,25] [,26] [,27]
## 1 "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
## 2 ">50K" "<=50K" ">50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" ">50K"
## [,28] [,29] [,30] [,31] [,32] [,33] [,34] [,35] [,36]
## 1 "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
## 2 "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" ">50K" ">50K"
## [,37] [,38] [,39] [,40] [,41] [,42] [,43] [,44] [,45]
## 1 "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
## 2 ">50K" "<=50K" ">50K" "<=50K" ">50K" "<=50K" "<=50K" ">50K" ">50K"
## [,46] [,47] [,48] [,49] [,50]
## 1 "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
## 2 ">50K" "<=50K" ">50K" ">50K" ">50K"
# Majority vote over the first 1, 3, 5, ... odd-indexed trees, tracking how the
# training misclassification rate changes with (odd) ensemble size.
n <- nrow(rf.untunned_ind)     # number of training observations
m <- ncol(rf.untunned_ind) / 2 # number of odd-sized sub-ensembles (50 trees -> 25)
predicted_ind <- character(n)  # preallocate: majority-vote class per observation
misclass.ind <- numeric(m)     # preallocate: error rate per sub-ensemble
for(i in seq_len(m)){ # number of trees used = 2*i - 1
for(j in seq_len(n)){
# Explicit parentheses: (1:i) * 2 - 1 selects columns 1, 3, ..., 2i-1.
# The original "1:i*2-1" relied on ':' binding tighter than '*', which is fragile.
predicted_ind[j] <- names(which.max(table(rf.untunned_ind[j, (1:i) * 2 - 1])))
}
misclass.ind[i] <- mean(predicted_ind != newtrain2$income)
}
rf.untunned.df <- data.frame(misclass.ind, ntree = seq(1, 49, 2))
ggplot(rf.untunned.df, aes(x = ntree, y = misclass.ind)) + geom_line() +
ggtitle("Number of trees vs Misclassification rate in training dataset - untunned random forest model")
#======================== Let's actually tune the hyperparameters
# List every tunable hyperparameter of the randomForest learner with its range
getParamSet(rf)
## Type len Def Constr Req Tunable Trafo
## ntree integer - 500 1 to Inf - TRUE -
## mtry integer - - 1 to Inf - TRUE -
## replace logical - TRUE - - TRUE -
## classwt numericvector <NA> - 0 to Inf - TRUE -
## cutoff numericvector <NA> - 0 to 1 - TRUE -
## strata untyped - - - - FALSE -
## sampsize integervector <NA> - 1 to Inf - TRUE -
## nodesize integer - 1 1 to Inf - TRUE -
## maxnodes integer - - 1 to Inf - TRUE -
## importance logical - FALSE - - TRUE -
## localImp logical - FALSE - - TRUE -
## proximity logical - FALSE - - FALSE -
## oob.prox logical - - - Y FALSE -
## norm.votes logical - TRUE - - FALSE -
## do.trace logical - FALSE - - FALSE -
## keep.forest logical - TRUE - - FALSE -
## keep.inbag logical - FALSE - - FALSE -
#Specifying the search space for hyperparameters
rf_params <- makeParamSet(makeIntegerParam("mtry", lower = 2, upper = 10),
makeIntegerParam("nodesize", lower = 10, upper = 50),
makeIntegerParam("ntree", lower = 3, upper = 100)
)
#Set validation strategy
rdesc <- makeResampleDesc("CV", iters = 3L)
#Set optimization technique
# Random search with only 5 draws: cheap, but may miss better regions
rf_ctrl <- makeTuneControlRandom(maxit = 5L)
#Start Hypertuning the parameters
rf_tune <- tuneParams(learner = rf, task = traintask, resampling = rdesc,
measures = list(acc), par.set = rf_params,
control = rf_ctrl, show.info = TRUE)
## [Tune] Started tuning learner classif.randomForest for parameter set:
## Type len Def Constr Req Tunable Trafo
## mtry integer - - 2 to 10 - TRUE -
## nodesize integer - - 10 to 50 - TRUE -
## ntree integer - - 3 to 100 - TRUE -
## With control class: TuneControlRandom
## Imputation value: -0
## [Tune-x] 1: mtry=8; nodesize=14; ntree=79
## [Tune-y] 1: acc.test.mean=0.866; time: 1.2 min
## [Tune-x] 2: mtry=6; nodesize=23; ntree=31
## [Tune-y] 2: acc.test.mean=0.865; time: 0.5 min
## [Tune-x] 3: mtry=3; nodesize=12; ntree=18
## [Tune-y] 3: acc.test.mean=0.858; time: 0.2 min
## [Tune-x] 4: mtry=3; nodesize=17; ntree=59
## [Tune-y] 4: acc.test.mean=0.858; time: 0.8 min
## [Tune-x] 5: mtry=5; nodesize=12; ntree=4
## [Tune-y] 5: acc.test.mean=0.852; time: 0.1 min
## [Tune] Result: mtry=8; nodesize=14; ntree=79 : acc.test.mean=0.866
#Optimal hypertuned parameters
# Best hyperparameter combination found by the random search
rf_tune$x
## $mtry
## [1] 8
##
## $nodesize
## [1] 14
##
## $ntree
## [1] 79
#Accuracy rate from Cross Validation
rf_tune$y
## acc.test.mean
## 0.8660267
#Use hyperparameters for modeling
# Copy the tuned values (mtry, nodesize, ntree) onto the learner
rf_tree <- setHyperPars(rf, par.vals = rf_tune$x)
#Train a model
rforest <- mlr::train(rf_tree, traintask)
getLearnerModel(rforest)
##
## Call:
## randomForest(formula = f, data = data, classwt = classwt, cutoff = cutoff, ntree = 79L, importance = TRUE, mtry = 8L, nodesize = 14L)
## Type of random forest: classification
## Number of trees: 79
## No. of variables tried at each split: 8
##
## OOB estimate of error rate: 13.47%
## Confusion matrix:
## <=50K >50K class.error
## <=50K 23063 1657 0.06703074
## >50K 2709 4973 0.35264254
#========================================================================
#Let's see how the test classification error changes as we increase the number of trees for tunned model ( #number of trees VS test classification error)
# predict.all = TRUE returns each individual tree's vote in $individual.
# (Spelled out TRUE instead of T, which is a reassignable variable in R.)
rf.tunned_ind <- predict(rforest$learner.model, newtrain2,
predict.all = TRUE)$individual
head(rf.tunned_ind, 2)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]
## 1 "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
## 2 "<=50K" "<=50K" ">50K" ">50K" ">50K" ">50K" "<=50K" "<=50K" ">50K"
## [,10] [,11] [,12] [,13] [,14] [,15] [,16] [,17] [,18]
## 1 "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
## 2 ">50K" "<=50K" "<=50K" ">50K" "<=50K" ">50K" "<=50K" ">50K" "<=50K"
## [,19] [,20] [,21] [,22] [,23] [,24] [,25] [,26] [,27]
## 1 "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
## 2 "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" ">50K" ">50K" "<=50K"
## [,28] [,29] [,30] [,31] [,32] [,33] [,34] [,35] [,36]
## 1 "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
## 2 "<=50K" ">50K" "<=50K" ">50K" ">50K" "<=50K" ">50K" ">50K" ">50K"
## [,37] [,38] [,39] [,40] [,41] [,42] [,43] [,44] [,45]
## 1 "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
## 2 "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K"
## [,46] [,47] [,48] [,49] [,50] [,51] [,52] [,53] [,54]
## 1 "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
## 2 "<=50K" "<=50K" ">50K" ">50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K"
## [,55] [,56] [,57] [,58] [,59] [,60] [,61] [,62] [,63]
## 1 "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
## 2 "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" ">50K" "<=50K" "<=50K"
## [,64] [,65] [,66] [,67] [,68] [,69] [,70] [,71] [,72]
## 1 "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
## 2 "<=50K" "<=50K" "<=50K" ">50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K"
## [,73] [,74] [,75] [,76] [,77] [,78] [,79]
## 1 "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
## 2 "<=50K" ">50K" "<=50K" "<=50K" ">50K" ">50K" "<=50K"
# Same tree-count vs training-error sweep for the tuned forest
# (79 trees -> 40 odd-sized sub-ensembles)
n <- nrow(rf.tunned_ind)                 # number of training observations
m <- ceiling(ncol(rf.tunned_ind) / 2)    # 40 sub-ensembles for 79 trees
predicted_ind <- character(n)  # preallocate: majority-vote class per observation
misclass.ind <- numeric(m)     # preallocate: error rate per sub-ensemble
for(i in seq_len(m)){ # number of trees used = 2*i - 1
for(j in seq_len(n)){
# (1:i) * 2 - 1 picks the odd columns 1, 3, ..., 2i-1 (parenthesized for clarity)
predicted_ind[j] <- names(which.max(table(rf.tunned_ind[j, (1:i) * 2 - 1])))
}
misclass.ind[i] <- mean(predicted_ind != newtrain2$income)
}
rf.tunned.df <- data.frame(misclass.ind, ntree = seq(1, 80, 2))
# BUG FIX: plot the tuned-model data frame. The original plotted rf.untunned.df
# under the "tunned random forest model" title.
ggplot(rf.tunned.df, aes(x = ntree, y = misclass.ind)) + geom_line() +
ggtitle("Number of trees vs Misclassification rate in training dataset - tunned random forest model")
#========================================================================
#***Make plots for random forest model
#Variable importance statistics
# Per-predictor importance (mean decrease in accuracy / Gini), plotted then printed
varImpPlot(rforest$learner.model)
importance(rforest$learner.model)
## <=50K >50K MeanDecreaseAccuracy
## age -4.5086634 34.16525780 36.4804145
## education.num 19.7742328 35.71472197 46.6996346
## capital.gain 52.7119955 77.24609078 73.3586739
## capital.loss 29.9711481 41.06776776 43.1336331
## hours.per.week -0.4312341 29.02454045 25.6659384
## Local.gov 5.1492927 0.56298571 5.8821188
## No.gain -0.8184111 1.00638984 -0.3717060
## Private 11.1016368 -2.23975243 10.5785695
## Self.emp.inc 1.1277732 5.82510793 7.7132744
## Self.emp.not.inc 15.5469946 -2.74505347 14.3523579
## State.gov 10.8733163 -4.28415056 6.9979343
## Married.AF.spouse -3.7189943 4.02124089 -2.0878889
## Married.civ.spouse 6.2045729 40.26237708 29.5940091
## Married.spouse.absent -2.8969481 1.41648079 -1.9070204
## Never.married -5.3562336 7.60306255 8.4697272
## Separated -1.7153316 2.95840510 2.3026940
## Widowed 0.5964076 2.59756458 1.8897186
## Armed.Forces 0.0000000 0.00000000 0.0000000
## Craft.repair 5.4840128 3.58903776 10.4377926
## Exec.managerial 5.3670247 19.21079683 21.5580767
## Farming.fishing 9.9615691 6.73540097 13.6178460
## Handlers.cleaners 0.3974181 9.37932930 9.3095290
## Machine.op.inspct 3.6133410 6.91776737 9.1384941
## Other.service -7.4759085 15.10751217 15.8621146
## Priv.house.serv -0.8300094 2.45121607 1.2156159
## Prof.specialty 7.6140905 11.38433854 14.9140532
## Protective.serv -4.2827246 8.95857684 5.1430640
## Sales -2.3749483 6.05115849 5.6645534
## Tech.support 2.1784476 16.92936100 12.9507474
## Transport.moving 4.7305436 2.42871893 7.5258776
## Not.in.family -1.1500949 9.19151189 10.7172849
## Other.relative -2.1896172 8.63515706 8.0793463
## Own.child 0.5788207 7.71050918 7.8753397
## Unmarried -2.0795340 4.76099939 4.1781013
## Wife 1.6250141 8.78884381 7.8250190
## Asian.Pac.Islander 2.5150741 -0.07866057 2.9269550
## Black 3.6337768 2.12945346 5.1723703
## Other -0.5773645 1.53909851 0.8206658
## White 3.9022122 4.50106434 6.7822017
## Male 5.2159776 6.08248030 13.1814993
## other_countries 4.9064107 0.58701542 5.2145352
## Philippines -2.5330590 4.79910859 2.6692177
## United.States 2.0788279 3.74229678 4.8721262
## MeanDecreaseGini
## age 739.84239923
## education.num 1069.23964617
## capital.gain 1184.14284781
## capital.loss 373.78120327
## hours.per.week 497.04494609
## Local.gov 34.45943727
## No.gain 0.39987285
## Private 61.12777811
## Self.emp.inc 47.35922677
## Self.emp.not.inc 66.06778851
## State.gov 31.22933659
## Married.AF.spouse 5.71895825
## Married.civ.spouse 1257.57544070
## Married.spouse.absent 7.16077121
## Never.married 270.22189400
## Separated 13.06962836
## Widowed 11.51838286
## Armed.Forces 0.04649277
## Craft.repair 38.07298242
## Exec.managerial 202.26249938
## Farming.fishing 57.51966059
## Handlers.cleaners 28.50154305
## Machine.op.inspct 31.39069988
## Other.service 73.15964271
## Priv.house.serv 1.04174638
## Prof.specialty 125.41528470
## Protective.serv 23.02565963
## Sales 41.36054495
## Tech.support 50.74963135
## Transport.moving 32.46031537
## Not.in.family 162.93055631
## Other.relative 12.64898998
## Own.child 75.38402000
## Unmarried 52.33314608
## Wife 73.94834919
## Asian.Pac.Islander 21.75418947
## Black 28.86962567
## Other 7.23911786
## White 37.00812346
## Male 127.51952291
## other_countries 26.08404616
## Philippines 8.72432373
## United.States 36.12278055
set.seed(100)
# ** Plot (top) subset of random forest tree
plot(rforest$learner.model)
#getTree(rforest$learner.model, k = 10, labelVar = TRUE)
# ** Make predictions on training dataset
# predict() on an mlr model + task returns a Prediction object; response/truth
# live in its $data slot
rfclass1 <- predict(rforest, traintask)
#Confusion matrix on training dataset
confusionMatrix(rfclass1$data$response, rfclass1$data$truth)
## Confusion Matrix and Statistics
##
## Reference
## Prediction <=50K >50K
## <=50K 23561 2189
## >50K 1159 5493
##
## Accuracy : 0.8967
## 95% CI : (0.8933, 0.9)
## No Information Rate : 0.7629
## P-Value [Acc > NIR] : < 0.00000000000000022
##
## Kappa : 0.7005
## Mcnemar's Test P-Value : < 0.00000000000000022
##
## Sensitivity : 0.9531
## Specificity : 0.7150
## Pos Pred Value : 0.9150
## Neg Pred Value : 0.8258
## Prevalence : 0.7629
## Detection Rate : 0.7271
## Detection Prevalence : 0.7947
## Balanced Accuracy : 0.8341
##
## 'Positive' Class : <=50K
##
#Make random forest plots on training dataset
# Mosaic-style plot of predicted class vs actual class (both factors)
plot(rfclass1$data$response, newtrain2$income)
abline(0, 1)
#Training accuracy rate
1 - mean(rfclass1$data$response != newtrain2$income)
## [1] 0.896673
#Make predictions on test dataset
rfclass2 <- predict(rforest, testtask)
#Confusion matrix on test dataset
confusionMatrix(rfclass2$data$response, rfclass2$data$truth)
## Confusion Matrix and Statistics
##
## Reference
## Prediction <=50K >50K
## <=50K 11601 1362
## >50K 834 2399
##
## Accuracy : 0.8644
## 95% CI : (0.859, 0.8696)
## No Information Rate : 0.7678
## P-Value [Acc > NIR] : < 0.00000000000000022
##
## Kappa : 0.6002
## Mcnemar's Test P-Value : < 0.00000000000000022
##
## Sensitivity : 0.9329
## Specificity : 0.6379
## Pos Pred Value : 0.8949
## Neg Pred Value : 0.7420
## Prevalence : 0.7678
## Detection Rate : 0.7163
## Detection Prevalence : 0.8004
## Balanced Accuracy : 0.7854
##
## 'Positive' Class : <=50K
##
#Make random forest plots on test dataset
plot(rfclass2$data$response, newtest2$income)
abline(0,1)
#Test accuracy rate
# Complement of the misclassification rate on the held-out test set
1 - mean(rfclass2$data$response != newtest2$income)
## [1] 0.864411
set.seed(100)
#ROC Curve: https://stackoverflow.com/questions/30818188/roc-curve-in-r-using-rpart-package
#Untunned random forest model
#Getting predicted >50K of income probabilities
# NOTE(review): rf still carries the cutoff = c(0.53, 0.47) par.vals set earlier,
# so this "untuned" baseline uses the shifted cutoff — confirm this is intended
untunned.forest <- mlr::train(rf, traintask)
untunned.rf_prob <- predict(untunned.forest$learner.model,
newdata = newtest2, type = "prob")[, 2]
# ROCR: prediction() pairs scores with labels; performance() computes tpr/fpr
untunned.rf_prediction <- prediction(untunned.rf_prob, newtest2$income)
untunned.rf_performance <- ROCR::performance(untunned.rf_prediction, measure = "tpr", x.measure = "fpr")
#Plot ROC curve
plot(untunned.rf_performance, main = "ROC curve")
abline(a = 0, b = 1, lty = 2)
#Calculate AUC
untunned.rf.auc <- ROCR::performance(untunned.rf_prediction,
measure = "auc")@y.values[[1]]
untunned.rf.auc
## [1] 0.8867613
#=====================================================================
#Tunned random forest model
#Getting predicted >50K of income probabilities
tunned.rf_prob <- predict(rforest$learner.model, newdata = newtest2,
type = "prob")[, 2]
tunned.rf_prediction <- prediction(tunned.rf_prob, newtest2$income)
tunned.rf_performance <- ROCR::performance(tunned.rf_prediction, measure = "tpr", x.measure = "fpr")
#Plot ROC curve
plot(tunned.rf_performance, main = "ROC curve")
abline(a = 0, b = 1, lty = 2)
#Calculate AUC
# @y.values is a list; [[1]] extracts the scalar AUC
tunned.rf.auc <- ROCR::performance(tunned.rf_prediction,
measure="auc")@y.values[[1]]
tunned.rf.auc
## [1] 0.8962369
\(\\\)
\(\\\)
set.seed(100)
# Overlay the two ROC curves: tuned forest in blue, untuned baseline in red,
# with the chance diagonal dashed for reference.
plot(tunned.rf_performance, col = "blue", main = "ROC curve")
plot(untunned.rf_performance, col = "red", add = TRUE)
abline(a = 0, b = 1, lty = 2)
legend("bottomright", legend = c("Tunned", "Untunned"), col = c("blue", "red"),
lwd = 3, cex = 0.8, horiz = TRUE)
\(\\\)
\(\\\)
set.seed(100)
# Candidate probability cutoffs to scan: 0.001 .. 0.999 in steps of 0.001
thresholds <- seq(from = 0.001, to = 0.999, by = 0.001)
# Preallocate instead of growing the vector on every loop iteration
accuracy <- numeric(length(thresholds))
#==================================================================
#Using train dataset to check new accuracy driven by new threshold
# Predicted P(income > 50K) on the training set from the untuned forest
untunned.rf_prob.train <- predict(untunned.forest$learner.model,
newdata = newtrain2, type = "prob")[, 2]
#Accuracy of the untuned forest at each candidate cutoff
for(i in seq_along(thresholds)){
accuracy[i] <- mean((untunned.rf_prob.train > thresholds[i]) ==
(newtrain2$income == ">50K"))
}
#Threshold which give maximum accuracy
# Index into the grid itself rather than re-deriving the value from the step size
thres1 <- thresholds[which.max(accuracy)]
thres1
## [1] 0.32
#plot of accuracy vs thresholds
threstable <- data.frame(thresholds, accuracy)
ggplot(threstable, aes(x = thresholds, y = accuracy)) + geom_point()
#Get confusion matrix of testset data using the optimal threshold
# Cutoff chosen on the training set, evaluated here on the held-out test set
confusionMatrix(untunned.rf_prob > thres1, newtest2$income == ">50K")
## Confusion Matrix and Statistics
##
## Reference
## Prediction FALSE TRUE
## FALSE 11192 1104
## TRUE 1243 2657
##
## Accuracy : 0.8551
## 95% CI : (0.8496, 0.8605)
## No Information Rate : 0.7678
## P-Value [Acc > NIR] : < 0.00000000000000022
##
## Kappa : 0.5988
## Mcnemar's Test P-Value : 0.004392
##
## Sensitivity : 0.9000
## Specificity : 0.7065
## Pos Pred Value : 0.9102
## Neg Pred Value : 0.6813
## Prevalence : 0.7678
## Detection Rate : 0.6910
## Detection Prevalence : 0.7592
## Balanced Accuracy : 0.8033
##
## 'Positive' Class : FALSE
##
#Test accuracy rate by using optimal threshold
# Fraction of test rows where (prob > cutoff) agrees with the true >50K label
rf.untunned.accuracy <- mean((untunned.rf_prob > thres1) == (newtest2$income == ">50K"))
#compare the test accuracy by using default threshold (0.5)
rf.untunned.accuracy.half <- mean((untunned.rf_prob > 0.5) == (newtest2$income == ">50K"))
#==================================================================
#Using train dataset to check new accuracy driven by new threshold
# Predicted P(income > 50K) on the training set from the tuned forest
tunned.rf_prob.train <- predict(rforest$learner.model,
newdata = newtrain2, type = "prob")[, 2]
#Accuracy of the tuned forest at each candidate cutoff
for(i in seq_along(thresholds)){
accuracy[i] <- mean((tunned.rf_prob.train > thresholds[i]) ==
(newtrain2$income == ">50K"))
}
#Threshold which give maximum accuracy
# Index into the grid itself rather than re-deriving the value from the step size
thres2 <- thresholds[which.max(accuracy)]
thres2
## [1] 0.406
#plot of accuracy vs thresholds
threstable <- data.frame(thresholds, accuracy)
ggplot(threstable, aes(x = thresholds, y = accuracy)) + geom_point()
#Get confusion matrix of testset data using the optimal threshold
# Cutoff chosen on the training set, evaluated here on the held-out test set
confusionMatrix(tunned.rf_prob > thres2, newtest2$income == ">50K")
## Confusion Matrix and Statistics
##
## Reference
## Prediction FALSE TRUE
## FALSE 11436 1257
## TRUE 999 2504
##
## Accuracy : 0.8607
## 95% CI : (0.8553, 0.866)
## No Information Rate : 0.7678
## P-Value [Acc > NIR] : < 0.00000000000000022
##
## Kappa : 0.5998
## Mcnemar's Test P-Value : 0.00000006273
##
## Sensitivity : 0.9197
## Specificity : 0.6658
## Pos Pred Value : 0.9010
## Neg Pred Value : 0.7148
## Prevalence : 0.7678
## Detection Rate : 0.7061
## Detection Prevalence : 0.7837
## Balanced Accuracy : 0.7927
##
## 'Positive' Class : FALSE
##
#Test accuracy rate by using optimal threshold
rf.tunned.accuracy <- mean((tunned.rf_prob > thres2) == (newtest2$income == ">50K"))
#compare the test accuracy by using default threshold (0.5)
rf.tunned.accuracy.half <- mean((tunned.rf_prob > 0.5) == (newtest2$income == ">50K"))
set.seed(100)
#Compare AUC
# order() on a one-row data frame sorts columns ascending: best value prints last
auc <- data.frame(tunned.rf.auc, untunned.rf.auc)
auc[, order(auc)]
## untunned.rf.auc tunned.rf.auc
## 1 0.8867613 0.8962369
#Pick the model with the largest AUC
final.auc3 <- rforest$learner.model
#Compare Accuracy - optimal threshold
accuracy.random.df <- data.frame(rf.tunned.accuracy, rf.untunned.accuracy)
accuracy.random.df[, order(accuracy.random.df)]
## rf.untunned.accuracy rf.tunned.accuracy
## 1 0.8550877 0.8607063
#Pick the model with the highest Accuracy
final.thres3 <- rforest$learner.model
#Compare Accuracy - default threshold(0.5)
accuracy.random.df.half <- data.frame(rf.tunned.accuracy.half,
rf.untunned.accuracy.half)
accuracy.random.df.half[, order(accuracy.random.df.half)]
## rf.tunned.accuracy.half rf.untunned.accuracy.half
## 1 0.8647814 0.866078
#Pick the model with the largest Accuracy
# At the default 0.5 cutoff the untuned forest is (slightly) more accurate,
# hence the untuned model is kept here
final.thres3.half <- untunned.forest$learner.model
set.seed(100)
#Change to binary digit
# gbm with distribution = "bernoulli" requires a numeric 0/1 response,
# so the 2-level factor is recoded (<=50K -> 0, >50K -> 1)
combined <- rbind(newtrain2, newtest2)
combined$income <- as.numeric(combined$income) - 1
#First model
# Rows 1:32402 of `combined` are the training portion; 5000 boosting iterations
boosting1 <- gbm(income ~., data = combined[1:32402, ], distribution = "bernoulli", n.trees = 5000,
interaction.depth = 5)
summary(boosting1)
## var rel.inf
## Married.civ.spouse Married.civ.spouse 38.1837367227
## education.num education.num 20.9431635081
## capital.gain capital.gain 19.4087323351
## age age 5.9208507786
## capital.loss capital.loss 5.6876403549
## hours.per.week hours.per.week 4.1947843573
## Exec.managerial Exec.managerial 2.1507741422
## Prof.specialty Prof.specialty 0.7081632260
## Farming.fishing Farming.fishing 0.6308016194
## Other.service Other.service 0.4673220145
## Wife Wife 0.4662046626
## Self.emp.not.inc Self.emp.not.inc 0.4328393928
## Tech.support Tech.support 0.2974840537
## Male Male 0.2127979776
## Self.emp.inc Self.emp.inc 0.0755118841
## Sales Sales 0.0653378908
## Machine.op.inspct Machine.op.inspct 0.0257029903
## Handlers.cleaners Handlers.cleaners 0.0256512584
## Married.AF.spouse Married.AF.spouse 0.0189114782
## White White 0.0172110098
## Local.gov Local.gov 0.0118613048
## Protective.serv Protective.serv 0.0118433164
## Not.in.family Not.in.family 0.0098799096
## Never.married Never.married 0.0091057857
## Own.child Own.child 0.0083974189
## United.States United.States 0.0068051014
## Philippines Philippines 0.0026801348
## other_countries other_countries 0.0017482677
## Transport.moving Transport.moving 0.0012086023
## Private Private 0.0011931320
## Unmarried Unmarried 0.0007028850
## Asian.Pac.Islander Asian.Pac.Islander 0.0003466238
## State.gov State.gov 0.0003137560
## Black Black 0.0002921043
## No.gain No.gain 0.0000000000
## Married.spouse.absent Married.spouse.absent 0.0000000000
## Separated Separated 0.0000000000
## Widowed Widowed 0.0000000000
## Armed.Forces Armed.Forces 0.0000000000
## Craft.repair Craft.repair 0.0000000000
## Priv.house.serv Priv.house.serv 0.0000000000
## Other.relative Other.relative 0.0000000000
## Other Other 0.0000000000
# Overall influence per variable, accumulated over all 5000 trees
varImp(boosting1, numTrees = 5000)
## Overall
## age 41799.956531
## education.num 147854.313003
## capital.gain 137021.552859
## capital.loss 40153.540174
## hours.per.week 29614.291991
## Local.gov 83.738308
## No.gain 0.000000
## Private 8.423260
## Self.emp.inc 533.097960
## Self.emp.not.inc 3055.754736
## State.gov 2.215051
## Married.AF.spouse 133.511044
## Married.civ.spouse 269569.120198
## Married.spouse.absent 0.000000
## Never.married 64.284925
## Separated 0.000000
## Widowed 0.000000
## Armed.Forces 0.000000
## Craft.repair 0.000000
## Exec.managerial 15184.011389
## Farming.fishing 4453.326263
## Handlers.cleaners 181.092469
## Machine.op.inspct 181.457685
## Other.service 3299.194765
## Priv.house.serv 0.000000
## Prof.specialty 4999.482874
## Protective.serv 83.611314
## Sales 461.271715
## Tech.support 2100.174616
## Transport.moving 8.532477
## Not.in.family 69.750076
## Other.relative 0.000000
## Own.child 59.284005
## Unmarried 4.962219
## Wife 3291.306496
## Asian.Pac.Islander 2.447091
## Black 2.062194
## Other 0.000000
## White 121.506095
## Male 1502.308798
## other_countries 12.342401
## Philippines 18.921186
## United.States 48.042579
#Test error of the first model
set.seed(100)
n.sweeps <- 500L
# Preallocate instead of growing the vector inside the loop
testerror1 <- numeric(n.sweeps)
thresh <- 0.5
# Hoist the loop-invariant test-set features/labels out of the loop instead of
# re-subsetting ~16k rows on every one of the 500 iterations
test.x <- combined[32403:48598, -44]
test.y <- combined[32403:48598, 44]
for(i in seq_len(n.sweeps)){
#Without type = "response", predict.gbm returns values on the logit scale.
yhat <- predict(boosting1, newdata = test.x, n.trees = (10 * i), type = "response")
yhat <- (yhat > thresh)
testerror1[i] <- mean(yhat != test.y)
}
plot(testerror1)
#ROC curve - testing
# (Removed the dead `pos1 <- c()` initialisation; it was immediately overwritten.)
pos1 <- predict(boosting1, newdata = test.x, n.trees = 5000, type = "response")
predicts1 <- prediction(pos1, test.y)
roc1 <- ROCR::performance(predicts1, measure = "tpr", x.measure = "fpr")
plot(roc1)
abline(0, 1, col = "red")
auc1 <- ROCR::performance(predicts1, measure = "auc")
auc1@y.values
## [[1]]
## [1] 0.913919
#Train error of the first model
# Same vectorised-prediction rewrite as the test-error chunk: one call to
# predict.gbm with the whole n.trees grid instead of 500 full rescans.
set.seed(100)  # kept for script parity; prediction itself is deterministic
thresh <- 0.5
train_idx <- 1:32402
tree_grid <- 10 * seq_len(500)
probs <- predict(boosting1, newdata = combined[train_idx, -44],
                 n.trees = tree_grid, type = "response")
trainerror1 <- colMeans((probs > thresh) != combined[train_idx, 44])
plot(trainerror1)
#ROC curve - training
pos1b <- predict(boosting1, newdata = combined[train_idx, -44],
                 n.trees = 5000, type = "response")
predicts1b <- prediction(pos1b, combined[train_idx, 44])
roc1b <- ROCR::performance(predicts1b, measure = "tpr", x.measure = "fpr")
plot(roc1b)
abline(0, 1, col = "red")  # chance diagonal
auc1b <- ROCR::performance(predicts1b, measure = "auc")
auc1b@y.values
## [[1]]
## [1] 0.9161409
#Second model
# Fit on the same training rows (1:32402) with deeper trees
# (interaction.depth = 5) but fewer iterations (2000) than model 1;
# shrinkage is left at the gbm package default. set.seed fixes the
# internal bagging randomness so the fit is reproducible.
set.seed(100)
boosting2 <- gbm(income ~., data = combined[1:32402, ], distribution = "bernoulli", n.trees = 2000,
interaction.depth = 5)
# Prints (and plots) the relative influence of each predictor.
summary(boosting2)
## var rel.inf
## Married.civ.spouse Married.civ.spouse 43.183739803
## education.num education.num 22.269924180
## capital.gain capital.gain 20.182879377
## age age 4.792613523
## capital.loss capital.loss 4.080112884
## hours.per.week hours.per.week 3.115147219
## Exec.managerial Exec.managerial 1.571469479
## Prof.specialty Prof.specialty 0.461127929
## Farming.fishing Farming.fishing 0.183178723
## Self.emp.not.inc Self.emp.not.inc 0.063849502
## Other.service Other.service 0.042141531
## Wife Wife 0.020667103
## Tech.support Tech.support 0.020408954
## Male Male 0.005805212
## Sales Sales 0.002968312
## Self.emp.inc Self.emp.inc 0.002497470
## Never.married Never.married 0.001468800
## Local.gov Local.gov 0.000000000
## No.gain No.gain 0.000000000
## Private Private 0.000000000
## State.gov State.gov 0.000000000
## Married.AF.spouse Married.AF.spouse 0.000000000
## Married.spouse.absent Married.spouse.absent 0.000000000
## Separated Separated 0.000000000
## Widowed Widowed 0.000000000
## Armed.Forces Armed.Forces 0.000000000
## Craft.repair Craft.repair 0.000000000
## Handlers.cleaners Handlers.cleaners 0.000000000
## Machine.op.inspct Machine.op.inspct 0.000000000
## Priv.house.serv Priv.house.serv 0.000000000
## Protective.serv Protective.serv 0.000000000
## Transport.moving Transport.moving 0.000000000
## Not.in.family Not.in.family 0.000000000
## Other.relative Other.relative 0.000000000
## Own.child Own.child 0.000000000
## Unmarried Unmarried 0.000000000
## Asian.Pac.Islander Asian.Pac.Islander 0.000000000
## Black Black 0.000000000
## Other Other 0.000000000
## White White 0.000000000
## other_countries other_countries 0.000000000
## Philippines Philippines 0.000000000
## United.States United.States 0.000000000
# Variable importance for the second model over all 2000 trees.
varImp(boosting2, numTrees = 2000)
## Overall
## age 29254.355185
## education.num 135936.742808
## capital.gain 123197.315843
## capital.loss 24905.215267
## hours.per.week 19015.016079
## Local.gov 0.000000
## No.gain 0.000000
## Private 0.000000
## Self.emp.inc 15.244682
## Self.emp.not.inc 389.740585
## State.gov 0.000000
## Married.AF.spouse 0.000000
## Married.civ.spouse 263595.730442
## Married.spouse.absent 0.000000
## Never.married 8.965628
## Separated 0.000000
## Widowed 0.000000
## Armed.Forces 0.000000
## Craft.repair 0.000000
## Exec.managerial 9592.329127
## Farming.fishing 1118.132183
## Handlers.cleaners 0.000000
## Machine.op.inspct 0.000000
## Other.service 257.234036
## Priv.house.serv 0.000000
## Prof.specialty 2814.748184
## Protective.serv 0.000000
## Sales 18.118727
## Tech.support 124.577289
## Transport.moving 0.000000
## Not.in.family 0.000000
## Other.relative 0.000000
## Own.child 0.000000
## Unmarried 0.000000
## Wife 126.153040
## Asian.Pac.Islander 0.000000
## Black 0.000000
## Other 0.000000
## White 0.000000
## Male 35.435306
## other_countries 0.000000
## Philippines 0.000000
## United.States 0.000000
#Test error of the second model
# Vectorised rewrite: predict.gbm takes the whole n.trees grid at once
# (200 values here, matching this model's 2000-tree fit) instead of the
# original loop that rescored the test set on every pass.
set.seed(100)  # kept for script parity; prediction itself is deterministic
thresh <- 0.5
test_idx <- 32403:48598
tree_grid <- 10 * seq_len(200)
probs <- predict(boosting2, newdata = combined[test_idx, -44],
                 n.trees = tree_grid, type = "response")
testerror2 <- colMeans((probs > thresh) != combined[test_idx, 44])
plot(testerror2)
#ROC curve - testing
pos2 <- predict(boosting2, newdata = combined[test_idx, -44],
                n.trees = 2000, type = "response")
predicts2 <- prediction(pos2, combined[test_idx, 44])
roc2 <- ROCR::performance(predicts2, measure = "tpr", x.measure = "fpr")
plot(roc2)
abline(0, 1, col = "red")  # chance diagonal
auc2 <- ROCR::performance(predicts2, measure = "auc")
auc2@y.values
## [[1]]
## [1] 0.9026846
#Train error of the second model
# Single vectorised predict over the n.trees grid; see the model-1 chunk.
set.seed(100)  # kept for script parity; prediction itself is deterministic
thresh <- 0.5
train_idx <- 1:32402
tree_grid <- 10 * seq_len(200)
probs <- predict(boosting2, newdata = combined[train_idx, -44],
                 n.trees = tree_grid, type = "response")
trainerror2 <- colMeans((probs > thresh) != combined[train_idx, 44])
plot(trainerror2)
#ROC curve - training
pos2b <- predict(boosting2, newdata = combined[train_idx, -44],
                 n.trees = 2000, type = "response")
predicts2b <- prediction(pos2b, combined[train_idx, 44])
roc2b <- ROCR::performance(predicts2b, measure = "tpr", x.measure = "fpr")
plot(roc2b)
abline(0, 1, col = "red")  # chance diagonal
auc2b <- ROCR::performance(predicts2b, measure = "auc")
auc2b@y.values
## [[1]]
## [1] 0.9032523
#Third model
# Back to 5000 trees like model 1 but with shallower interactions
# (interaction.depth = 3); shrinkage left at the package default.
set.seed(100)
boosting3 <- gbm(income ~., data = combined[1:32402, ], distribution = "bernoulli", n.trees = 5000,
interaction.depth = 3)
# Prints (and plots) the relative influence of each predictor.
summary(boosting3)
## var rel.inf
## Married.civ.spouse Married.civ.spouse 38.4653932551
## capital.gain capital.gain 21.5925456394
## education.num education.num 20.6266997102
## age age 5.9262919869
## capital.loss capital.loss 4.9121468429
## hours.per.week hours.per.week 4.1693779643
## Exec.managerial Exec.managerial 2.1015494026
## Prof.specialty Prof.specialty 0.5954335020
## Farming.fishing Farming.fishing 0.5462841653
## Other.service Other.service 0.3328796337
## Wife Wife 0.2756071261
## Self.emp.not.inc Self.emp.not.inc 0.2224774222
## Tech.support Tech.support 0.1327361634
## Male Male 0.0496445933
## Self.emp.inc Self.emp.inc 0.0362639675
## Sales Sales 0.0067688417
## Married.AF.spouse Married.AF.spouse 0.0028144804
## Not.in.family Not.in.family 0.0022786683
## Local.gov Local.gov 0.0012623225
## United.States United.States 0.0009382238
## White White 0.0006060883
## No.gain No.gain 0.0000000000
## Private Private 0.0000000000
## State.gov State.gov 0.0000000000
## Married.spouse.absent Married.spouse.absent 0.0000000000
## Never.married Never.married 0.0000000000
## Separated Separated 0.0000000000
## Widowed Widowed 0.0000000000
## Armed.Forces Armed.Forces 0.0000000000
## Craft.repair Craft.repair 0.0000000000
## Handlers.cleaners Handlers.cleaners 0.0000000000
## Machine.op.inspct Machine.op.inspct 0.0000000000
## Priv.house.serv Priv.house.serv 0.0000000000
## Protective.serv Protective.serv 0.0000000000
## Transport.moving Transport.moving 0.0000000000
## Other.relative Other.relative 0.0000000000
## Own.child Own.child 0.0000000000
## Unmarried Unmarried 0.0000000000
## Asian.Pac.Islander Asian.Pac.Islander 0.0000000000
## Black Black 0.0000000000
## Other Other 0.0000000000
## other_countries other_countries 0.0000000000
## Philippines Philippines 0.0000000000
# Variable importance for the third model over all 5000 trees.
varImp(boosting3, numTrees = 5000)
## Overall
## age 39889.279469
## education.num 138836.255637
## capital.gain 145337.268121
## capital.loss 33063.169794
## hours.per.week 28063.666657
## Local.gov 8.496567
## No.gain 0.000000
## Private 0.000000
## Self.emp.inc 244.089143
## Self.emp.not.inc 1497.473309
## State.gov 0.000000
## Married.AF.spouse 18.943987
## Married.civ.spouse 258906.720229
## Married.spouse.absent 0.000000
## Never.married 0.000000
## Separated 0.000000
## Widowed 0.000000
## Armed.Forces 0.000000
## Craft.repair 0.000000
## Exec.managerial 14145.319134
## Farming.fishing 3676.984156
## Handlers.cleaners 0.000000
## Machine.op.inspct 0.000000
## Other.service 2240.579568
## Priv.house.serv 0.000000
## Prof.specialty 4007.803433
## Protective.serv 0.000000
## Sales 45.560398
## Tech.support 893.433859
## Transport.moving 0.000000
## Not.in.family 15.337489
## Other.relative 0.000000
## Own.child 0.000000
## Unmarried 0.000000
## Wife 1855.084039
## Asian.Pac.Islander 0.000000
## Black 0.000000
## Other 0.000000
## White 4.079520
## Male 334.152799
## other_countries 0.000000
## Philippines 0.000000
## United.States 6.315091
#Test error of the third model
# Vectorised rewrite of the 500-pass rescoring loop (see model-1 chunk).
set.seed(100)  # kept for script parity; prediction itself is deterministic
thresh <- 0.5
test_idx <- 32403:48598
tree_grid <- 10 * seq_len(500)
probs <- predict(boosting3, newdata = combined[test_idx, -44],
                 n.trees = tree_grid, type = "response")
testerror3 <- colMeans((probs > thresh) != combined[test_idx, 44])
plot(testerror3)
#ROC curve - testing
pos3 <- predict(boosting3, newdata = combined[test_idx, -44],
                n.trees = 5000, type = "response")
predicts3 <- prediction(pos3, combined[test_idx, 44])
roc3 <- ROCR::performance(predicts3, measure = "tpr", x.measure = "fpr")
plot(roc3)
abline(0, 1, col = "red")  # chance diagonal
auc3 <- ROCR::performance(predicts3, measure = "auc")
auc3@y.values
## [[1]]
## [1] 0.9086526
#Train error of the third model
# Vectorised rewrite of the 500-pass rescoring loop (see model-1 chunk).
set.seed(100)  # kept for script parity; prediction itself is deterministic
thresh <- 0.5
train_idx <- 1:32402
tree_grid <- 10 * seq_len(500)
probs <- predict(boosting3, newdata = combined[train_idx, -44],
                 n.trees = tree_grid, type = "response")
trainerror3 <- colMeans((probs > thresh) != combined[train_idx, 44])
plot(trainerror3)
#ROC curve - training
pos3b <- predict(boosting3, newdata = combined[train_idx, -44],
                 n.trees = 5000, type = "response")
predicts3b <- prediction(pos3b, combined[train_idx, 44])
roc3b <- ROCR::performance(predicts3b, measure = "tpr", x.measure = "fpr")
plot(roc3b)
abline(0, 1, col = "red")  # chance diagonal
auc3b <- ROCR::performance(predicts3b, measure = "auc")
auc3b@y.values
## [[1]]
## [1] 0.9100084
#Fourth model
# Same depth as model 3 but with a much larger learning rate
# (shrinkage = 0.2), so the fit moves faster per tree and is expected
# to overfit sooner along the 5000-tree path.
set.seed(100)
boosting4 <- gbm(income ~., data = combined[1:32402, ], distribution = "bernoulli", n.trees = 5000,
interaction.depth = 3, shrinkage = 0.2)
# Prints (and plots) the relative influence of each predictor.
summary(boosting4)
## var rel.inf
## Married.civ.spouse Married.civ.spouse 18.026448560
## age age 16.972500853
## education.num education.num 14.284120321
## hours.per.week hours.per.week 12.663868474
## capital.gain capital.gain 9.733799355
## capital.loss capital.loss 4.043940259
## Exec.managerial Exec.managerial 2.339896499
## Wife Wife 2.312137098
## Self.emp.inc Self.emp.inc 1.477410233
## Sales Sales 1.421837748
## Self.emp.not.inc Self.emp.not.inc 1.363968271
## Prof.specialty Prof.specialty 1.270072643
## Craft.repair Craft.repair 1.217346462
## Private Private 0.979140053
## Local.gov Local.gov 0.962075427
## Tech.support Tech.support 0.956414372
## Transport.moving Transport.moving 0.929744557
## Protective.serv Protective.serv 0.906260989
## Married.AF.spouse Married.AF.spouse 0.847841879
## Male Male 0.735702554
## State.gov State.gov 0.701580425
## Farming.fishing Farming.fishing 0.614732088
## Not.in.family Not.in.family 0.542609381
## White White 0.528052568
## other_countries other_countries 0.515926852
## Asian.Pac.Islander Asian.Pac.Islander 0.511077925
## United.States United.States 0.447415310
## Machine.op.inspct Machine.op.inspct 0.424901024
## Never.married Never.married 0.387397190
## Other.service Other.service 0.383110797
## Black Black 0.366741057
## Philippines Philippines 0.223151705
## Unmarried Unmarried 0.220840811
## Widowed Widowed 0.168091780
## Handlers.cleaners Handlers.cleaners 0.138868006
## Married.spouse.absent Married.spouse.absent 0.137691429
## Separated Separated 0.100353469
## Other Other 0.049692624
## Other.relative Other.relative 0.040436779
## Own.child Own.child 0.038139196
## No.gain No.gain 0.007916710
## Priv.house.serv Priv.house.serv 0.006746267
## Armed.Forces Armed.Forces 0.000000000
# Variable importance for the fourth model over all 5000 trees.
varImp(boosting4, numTrees = 5000)
## Overall
## age 1625.9627048
## education.num 1368.4163056
## capital.gain 932.4963283
## capital.loss 387.4087914
## hours.per.week 1213.1964533
## Local.gov 92.1666628
## No.gain 0.7584195
## Private 93.8014511
## Self.emp.inc 141.5356499
## Self.emp.not.inc 130.6679292
## State.gov 67.2112858
## Married.AF.spouse 81.2231083
## Married.civ.spouse 1726.9307165
## Married.spouse.absent 13.1908156
## Never.married 37.1125852
## Separated 9.6138453
## Widowed 16.1031641
## Armed.Forces 0.0000000
## Craft.repair 116.6215847
## Exec.managerial 224.1616879
## Farming.fishing 58.8912299
## Handlers.cleaners 13.3035314
## Machine.op.inspct 40.7054461
## Other.service 36.7019494
## Priv.house.serv 0.6462912
## Prof.specialty 121.6727439
## Protective.serv 86.8196491
## Sales 136.2118152
## Tech.support 91.6243347
## Transport.moving 89.0693709
## Not.in.family 51.9818867
## Other.relative 3.8738366
## Own.child 3.6537285
## Unmarried 21.1565123
## Wife 221.5023421
## Asian.Pac.Islander 48.9611786
## Black 35.1337311
## Other 4.7605449
## White 50.5873465
## Male 70.4801800
## other_countries 49.4257050
## Philippines 21.3778955
## United.States 42.8623109
#Test error of the fourth model
# Vectorised rewrite of the 500-pass rescoring loop (see model-1 chunk).
set.seed(100)  # kept for script parity; prediction itself is deterministic
thresh <- 0.5
test_idx <- 32403:48598
tree_grid <- 10 * seq_len(500)
probs <- predict(boosting4, newdata = combined[test_idx, -44],
                 n.trees = tree_grid, type = "response")
testerror4 <- colMeans((probs > thresh) != combined[test_idx, 44])
plot(testerror4)
#ROC curve - testing
# n.trees = 150 here, not 5000 — presumably the early minimum read off the
# test-error curve for this high-shrinkage model (TODO confirm intent).
pos4 <- predict(boosting4, newdata = combined[test_idx, -44],
                n.trees = 150, type = "response")
predicts4 <- prediction(pos4, combined[test_idx, 44])
roc4 <- ROCR::performance(predicts4, measure = "tpr", x.measure = "fpr")
plot(roc4)
abline(0, 1, col = "red")  # chance diagonal
auc4 <- ROCR::performance(predicts4, measure = "auc")
auc4@y.values
## [[1]]
## [1] 0.9209653
#Train error of the fourth model
# Vectorised rewrite of the 500-pass rescoring loop (see model-1 chunk).
set.seed(100)  # kept for script parity; prediction itself is deterministic
thresh <- 0.5
train_idx <- 1:32402
tree_grid <- 10 * seq_len(500)
probs <- predict(boosting4, newdata = combined[train_idx, -44],
                 n.trees = tree_grid, type = "response")
trainerror4 <- colMeans((probs > thresh) != combined[train_idx, 44])
plot(trainerror4)
#ROC curve - training
pos4b <- predict(boosting4, newdata = combined[train_idx, -44],
                 n.trees = 5000, type = "response")
predicts4b <- prediction(pos4b, combined[train_idx, 44])
roc4b <- ROCR::performance(predicts4b, measure = "tpr", x.measure = "fpr")
plot(roc4b)
abline(0, 1, col = "red")  # chance diagonal
auc4b <- ROCR::performance(predicts4b, measure = "auc")
auc4b@y.values
## [[1]]
## [1] 0.9556233
#Fifth model
# Same as model 4 but with a smaller learning rate (shrinkage = 0.1),
# trading per-tree progress for a smoother fit over the 5000 trees.
set.seed(100)
boosting5 <- gbm(income ~., data = combined[1:32402, ], distribution = "bernoulli", n.trees = 5000,
interaction.depth = 3, shrinkage = 0.1)
# Prints (and plots) the relative influence of each predictor.
summary(boosting5)
## var rel.inf
## Married.civ.spouse Married.civ.spouse 21.749111849
## education.num education.num 15.520807408
## age age 13.841105494
## capital.gain capital.gain 13.128327788
## hours.per.week hours.per.week 10.656667597
## capital.loss capital.loss 5.072048205
## Exec.managerial Exec.managerial 2.049942975
## Wife Wife 1.926472668
## Prof.specialty Prof.specialty 1.288391194
## Self.emp.not.inc Self.emp.not.inc 1.139338856
## Self.emp.inc Self.emp.inc 1.033026573
## Sales Sales 1.017303679
## Private Private 0.863058329
## Married.AF.spouse Married.AF.spouse 0.823091712
## Tech.support Tech.support 0.774572610
## Craft.repair Craft.repair 0.742961349
## Local.gov Local.gov 0.739759906
## Protective.serv Protective.serv 0.687410865
## Transport.moving Transport.moving 0.670745956
## Male Male 0.641166610
## Farming.fishing Farming.fishing 0.618918294
## State.gov State.gov 0.567625482
## Not.in.family Not.in.family 0.463684678
## White White 0.435534037
## Asian.Pac.Islander Asian.Pac.Islander 0.430976991
## Other.service Other.service 0.422750642
## other_countries other_countries 0.353529212
## Machine.op.inspct Machine.op.inspct 0.329448541
## United.States United.States 0.307138100
## Never.married Never.married 0.282278336
## Black Black 0.268746977
## Unmarried Unmarried 0.228712965
## Philippines Philippines 0.222233474
## Widowed Widowed 0.167860893
## Handlers.cleaners Handlers.cleaners 0.156685930
## Married.spouse.absent Married.spouse.absent 0.106604852
## Separated Separated 0.106553495
## Own.child Own.child 0.063757082
## Other Other 0.053894161
## Other.relative Other.relative 0.036158013
## No.gain No.gain 0.007362428
## Priv.house.serv Priv.house.serv 0.004233794
## Armed.Forces Armed.Forces 0.000000000
# Variable importance for the fifth model over all 5000 trees.
varImp(boosting5, numTrees = 5000)
## Overall
## age 1885.5216440
## education.num 2114.3411062
## capital.gain 1788.4226230
## capital.loss 690.9460140
## hours.per.week 1451.7176692
## Local.gov 100.7747043
## No.gain 1.0029558
## Private 117.5711839
## Self.emp.inc 140.7253173
## Self.emp.not.inc 155.2078390
## State.gov 77.3254804
## Married.AF.spouse 112.1266822
## Married.civ.spouse 2962.7995501
## Married.spouse.absent 14.5223773
## Never.married 38.4537139
## Separated 14.5153811
## Widowed 22.8670569
## Armed.Forces 0.0000000
## Craft.repair 101.2108249
## Exec.managerial 279.2560067
## Farming.fishing 84.3129070
## Handlers.cleaners 21.3447339
## Machine.op.inspct 44.8795333
## Other.service 57.5897270
## Priv.house.serv 0.5767538
## Prof.specialty 175.5126773
## Protective.serv 93.6433918
## Sales 138.5834467
## Tech.support 105.5171079
## Transport.moving 91.3731940
## Not.in.family 63.1660164
## Other.relative 4.9256698
## Own.child 8.6853870
## Unmarried 31.1567054
## Wife 262.4361121
## Asian.Pac.Islander 58.7103714
## Black 36.6103879
## Other 7.3417984
## White 59.3311606
## Male 87.3437111
## other_countries 48.1599524
## Philippines 30.2740288
## United.States 41.8402660
#Test error of the fifth model
# Vectorised rewrite of the 500-pass rescoring loop (see model-1 chunk).
set.seed(100)  # kept for script parity; prediction itself is deterministic
thresh <- 0.5
test_idx <- 32403:48598
tree_grid <- 10 * seq_len(500)
probs <- predict(boosting5, newdata = combined[test_idx, -44],
                 n.trees = tree_grid, type = "response")
testerror5 <- colMeans((probs > thresh) != combined[test_idx, 44])
plot(testerror5)
#ROC curve - testing
# n.trees = 800, not 5000 — presumably chosen from the test-error curve
# for this shrinkage = 0.1 model (TODO confirm intent).
pos5 <- predict(boosting5, newdata = combined[test_idx, -44],
                n.trees = 800, type = "response")
predicts5 <- prediction(pos5, combined[test_idx, 44])
roc5 <- ROCR::performance(predicts5, measure = "tpr", x.measure = "fpr")
plot(roc5)
abline(0, 1, col = "red")  # chance diagonal
auc5 <- ROCR::performance(predicts5, measure = "auc")
auc5@y.values
## [[1]]
## [1] 0.9231948
#Train error of the fifth model
# Vectorised rewrite of the 500-pass rescoring loop (see model-1 chunk).
set.seed(100)  # kept for script parity; prediction itself is deterministic
thresh <- 0.5
train_idx <- 1:32402
tree_grid <- 10 * seq_len(500)
probs <- predict(boosting5, newdata = combined[train_idx, -44],
                 n.trees = tree_grid, type = "response")
trainerror5 <- colMeans((probs > thresh) != combined[train_idx, 44])
plot(trainerror5)
#ROC curve - training
pos5b <- predict(boosting5, newdata = combined[train_idx, -44],
                 n.trees = 5000, type = "response")
predicts5b <- prediction(pos5b, combined[train_idx, 44])
roc5b <- ROCR::performance(predicts5b, measure = "tpr", x.measure = "fpr")
plot(roc5b)
abline(0, 1, col = "red")  # chance diagonal
auc5b <- ROCR::performance(predicts5b, measure = "auc")
auc5b@y.values
## [[1]]
## [1] 0.9496323
#ROC and AUC combined testing
# Overlay the five test-set ROC curves on one panel. ROCR's plot method
# supports add = TRUE, which replaces the par(new = TRUE) trick (par(new)
# redraws over the device without guaranteeing aligned axes), and a real
# legend replaces the colour key crammed into the title.
model_cols <- c("red", "green", "blue", "black", "yellow")
plot(roc1, type = "l", col = model_cols[1],
     main = "Test-set ROC curves, boosting models 1-5")
plot(roc2, type = "l", col = model_cols[2], add = TRUE)
plot(roc3, type = "l", col = model_cols[3], add = TRUE)
plot(roc4, type = "l", col = model_cols[4], add = TRUE)
plot(roc5, type = "l", col = model_cols[5], add = TRUE)
legend("bottomright", legend = paste("model", 1:5),
       col = model_cols, lty = 1)
paste("AUC for model 1 is", round(auc1@y.values[[1]], 5))
## [1] "AUC for model 1 is 0.91392"
paste("AUC for model 2 is", round(auc2@y.values[[1]], 5))
## [1] "AUC for model 2 is 0.90268"
paste("AUC for model 3 is", round(auc3@y.values[[1]], 5))
## [1] "AUC for model 3 is 0.90865"
paste("AUC for model 4 is", round(auc4@y.values[[1]], 5))
## [1] "AUC for model 4 is 0.92097"
paste("AUC for model 5 is", round(auc5@y.values[[1]], 5))
## [1] "AUC for model 5 is 0.92319"
#ROC and AUC combined training
# Same overlay as the test-set panel, for the training-set ROC curves:
# plot(..., add = TRUE) instead of par(new = TRUE), with a proper legend.
model_cols <- c("red", "green", "blue", "black", "yellow")
plot(roc1b, type = "l", col = model_cols[1],
     main = "Training-set ROC curves, boosting models 1-5")
plot(roc2b, type = "l", col = model_cols[2], add = TRUE)
plot(roc3b, type = "l", col = model_cols[3], add = TRUE)
plot(roc4b, type = "l", col = model_cols[4], add = TRUE)
plot(roc5b, type = "l", col = model_cols[5], add = TRUE)
legend("bottomright", legend = paste("model", 1:5),
       col = model_cols, lty = 1)
paste("AUC for model 1 is", round(auc1b@y.values[[1]], 5))
## [1] "AUC for model 1 is 0.91614"
paste("AUC for model 2 is", round(auc2b@y.values[[1]], 5))
## [1] "AUC for model 2 is 0.90325"
paste("AUC for model 3 is", round(auc3b@y.values[[1]], 5))
## [1] "AUC for model 3 is 0.91001"
paste("AUC for model 4 is", round(auc4b@y.values[[1]], 5))
## [1] "AUC for model 4 is 0.95562"
paste("AUC for model 5 is", round(auc5b@y.values[[1]], 5))
## [1] "AUC for model 5 is 0.94963"
#Partial dependence plots
# One 2x3 panel of partial dependence plots per model, for the six most
# influential predictors. The five copy-pasted loops collapse into one
# nested loop over the fitted models; plot.gbm's argument is `i.var`,
# spelled out here to avoid partial argument matching (`i =`).
variables <- c("Married.civ.spouse", "education.num", "age", "capital.gain",
"hours-per-week", "capital.loss")
variables[5] <- "hours.per.week"  # guard against typo; keep dot-name used by gbm
old_par <- par(mfrow = c(2, 3))
for (model in list(boosting1, boosting2, boosting3, boosting4, boosting5)) {
  for (v in variables) {
    plot(model, i.var = v)
  }
}
par(old_par)  # restore the previous plotting layout
#Check imbalance
# Class distribution of the response over all 48598 rows.
table(combined$income)
##
## 0 1
## 37155 11443
# Proportions computed from the data instead of the hard-coded counts
# (11443 / 48598 and 37155 / 48598) the original used, so this stays
# correct if the data change. Class 1 ~23.5%, class 0 ~76.5%.
prop.table(table(combined$income))
# (removed: two "\(\\\)" LaTeX spacing artifacts left over from the
# rendered report; they are not valid R code)
# Tune gbm via caret: 10-fold cross-validation repeated 5 times, selecting
# the n.trees / interaction.depth / shrinkage combination by accuracy.
# verbose = FALSE is forwarded through caret's `...` to gbm() to silence
# the per-iteration deviance trace (visible as hundreds of output lines
# in the original run).
set.seed(100)
trctrl <- trainControl(method = "repeatedcv", number = 10, repeats = 5)
boostingtrain <- caret::train(income ~ ., data = newtrain2, method = "gbm",
                              metric = "Accuracy", trControl = trctrl,
                              verbose = FALSE)
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:Hmisc':
##
## is.discrete, summarize
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0580 nan 0.1000 0.0185
## 2 1.0284 nan 0.1000 0.0149
## 3 1.0038 nan 0.1000 0.0122
## 4 0.9833 nan 0.1000 0.0100
## 5 0.9639 nan 0.1000 0.0097
## 6 0.9472 nan 0.1000 0.0080
## 7 0.9302 nan 0.1000 0.0083
## 8 0.9174 nan 0.1000 0.0062
## 9 0.9075 nan 0.1000 0.0049
## 10 0.8949 nan 0.1000 0.0063
## 20 0.8154 nan 0.1000 0.0023
## 40 0.7375 nan 0.1000 0.0012
## 60 0.6956 nan 0.1000 0.0010
## 80 0.6715 nan 0.1000 0.0006
## 100 0.6560 nan 0.1000 0.0002
## 120 0.6452 nan 0.1000 0.0001
## 140 0.6367 nan 0.1000 0.0003
## 150 0.6334 nan 0.1000 0.0002
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0438 nan 0.1000 0.0260
## 2 1.0038 nan 0.1000 0.0197
## 3 0.9699 nan 0.1000 0.0165
## 4 0.9434 nan 0.1000 0.0132
## 5 0.9141 nan 0.1000 0.0146
## 6 0.8911 nan 0.1000 0.0116
## 7 0.8742 nan 0.1000 0.0082
## 8 0.8586 nan 0.1000 0.0080
## 9 0.8429 nan 0.1000 0.0079
## 10 0.8321 nan 0.1000 0.0054
## 20 0.7506 nan 0.1000 0.0022
## 40 0.6788 nan 0.1000 0.0008
## 60 0.6486 nan 0.1000 0.0004
## 80 0.6321 nan 0.1000 0.0002
## 100 0.6210 nan 0.1000 0.0002
## 120 0.6127 nan 0.1000 0.0001
## 140 0.6062 nan 0.1000 0.0001
## 150 0.6032 nan 0.1000 0.0002
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0367 nan 0.1000 0.0296
## 2 0.9886 nan 0.1000 0.0236
## 3 0.9516 nan 0.1000 0.0183
## 4 0.9208 nan 0.1000 0.0154
## 5 0.8950 nan 0.1000 0.0129
## 6 0.8698 nan 0.1000 0.0124
## 7 0.8497 nan 0.1000 0.0100
## 8 0.8337 nan 0.1000 0.0080
## 9 0.8169 nan 0.1000 0.0082
## 10 0.8030 nan 0.1000 0.0070
## 20 0.7199 nan 0.1000 0.0027
## 40 0.6546 nan 0.1000 0.0010
## 60 0.6279 nan 0.1000 0.0002
## 80 0.6132 nan 0.1000 0.0001
## 100 0.6013 nan 0.1000 0.0001
## 120 0.5948 nan 0.1000 0.0002
## 140 0.5893 nan 0.1000 0.0001
## 150 0.5867 nan 0.1000 0.0000
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0580 nan 0.1000 0.0187
## 2 1.0279 nan 0.1000 0.0150
## 3 1.0036 nan 0.1000 0.0122
## 4 0.9832 nan 0.1000 0.0104
## 5 0.9636 nan 0.1000 0.0098
## 6 0.9491 nan 0.1000 0.0070
## 7 0.9328 nan 0.1000 0.0083
## 8 0.9173 nan 0.1000 0.0078
## 9 0.9072 nan 0.1000 0.0048
## 10 0.8946 nan 0.1000 0.0062
## 20 0.8117 nan 0.1000 0.0033
## 40 0.7370 nan 0.1000 0.0010
## 60 0.6943 nan 0.1000 0.0007
## 80 0.6707 nan 0.1000 0.0004
## 100 0.6552 nan 0.1000 0.0003
## 120 0.6448 nan 0.1000 0.0001
## 140 0.6365 nan 0.1000 0.0000
## 150 0.6329 nan 0.1000 0.0000
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0431 nan 0.1000 0.0257
## 2 1.0030 nan 0.1000 0.0200
## 3 0.9693 nan 0.1000 0.0170
## 4 0.9415 nan 0.1000 0.0137
## 5 0.9126 nan 0.1000 0.0144
## 6 0.8891 nan 0.1000 0.0116
## 7 0.8715 nan 0.1000 0.0090
## 8 0.8541 nan 0.1000 0.0087
## 9 0.8392 nan 0.1000 0.0076
## 10 0.8281 nan 0.1000 0.0056
## 20 0.7486 nan 0.1000 0.0027
## 40 0.6755 nan 0.1000 0.0010
## 60 0.6468 nan 0.1000 0.0005
## 80 0.6319 nan 0.1000 0.0003
## 100 0.6195 nan 0.1000 0.0003
## 120 0.6115 nan 0.1000 0.0003
## 140 0.6053 nan 0.1000 0.0001
## 150 0.6023 nan 0.1000 0.0000
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0359 nan 0.1000 0.0296
## 2 0.9895 nan 0.1000 0.0236
## 3 0.9521 nan 0.1000 0.0185
## 4 0.9194 nan 0.1000 0.0157
## 5 0.8942 nan 0.1000 0.0124
## 6 0.8710 nan 0.1000 0.0113
## 7 0.8490 nan 0.1000 0.0109
## 8 0.8308 nan 0.1000 0.0091
## 9 0.8155 nan 0.1000 0.0077
## 10 0.8019 nan 0.1000 0.0067
## 20 0.7189 nan 0.1000 0.0026
## 40 0.6532 nan 0.1000 0.0008
## 60 0.6278 nan 0.1000 0.0002
## 80 0.6123 nan 0.1000 0.0004
## 100 0.6011 nan 0.1000 0.0001
## 120 0.5929 nan 0.1000 0.0001
## 140 0.5870 nan 0.1000 0.0001
## 150 0.5844 nan 0.1000 0.0000
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0577 nan 0.1000 0.0186
## 2 1.0270 nan 0.1000 0.0149
## 3 1.0030 nan 0.1000 0.0121
## 4 0.9828 nan 0.1000 0.0097
## 5 0.9629 nan 0.1000 0.0096
## 6 0.9456 nan 0.1000 0.0084
## 7 0.9296 nan 0.1000 0.0077
## 8 0.9168 nan 0.1000 0.0067
## 9 0.9039 nan 0.1000 0.0064
## 10 0.8914 nan 0.1000 0.0062
## 20 0.8145 nan 0.1000 0.0023
## 40 0.7356 nan 0.1000 0.0014
## 60 0.6942 nan 0.1000 0.0010
## 80 0.6687 nan 0.1000 0.0005
## 100 0.6541 nan 0.1000 0.0002
## 120 0.6431 nan 0.1000 0.0001
## 140 0.6353 nan 0.1000 0.0003
## 150 0.6322 nan 0.1000 0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0433 nan 0.1000 0.0257
## 2 1.0031 nan 0.1000 0.0197
## 3 0.9700 nan 0.1000 0.0163
## 4 0.9425 nan 0.1000 0.0136
## 5 0.9135 nan 0.1000 0.0147
## 6 0.8906 nan 0.1000 0.0115
## 7 0.8709 nan 0.1000 0.0097
## 8 0.8562 nan 0.1000 0.0074
## 9 0.8425 nan 0.1000 0.0067
## 10 0.8308 nan 0.1000 0.0056
## 20 0.7479 nan 0.1000 0.0035
## 40 0.6767 nan 0.1000 0.0012
## 60 0.6450 nan 0.1000 0.0005
## 80 0.6295 nan 0.1000 0.0003
## 100 0.6194 nan 0.1000 0.0001
## 120 0.6106 nan 0.1000 0.0001
## 140 0.6040 nan 0.1000 0.0001
## 150 0.6014 nan 0.1000 0.0000
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0362 nan 0.1000 0.0293
## 2 0.9906 nan 0.1000 0.0227
## 3 0.9517 nan 0.1000 0.0202
## 4 0.9212 nan 0.1000 0.0154
## 5 0.8957 nan 0.1000 0.0128
## 6 0.8732 nan 0.1000 0.0111
## 7 0.8513 nan 0.1000 0.0110
## 8 0.8333 nan 0.1000 0.0091
## 9 0.8173 nan 0.1000 0.0076
## 10 0.8051 nan 0.1000 0.0059
## 20 0.7203 nan 0.1000 0.0023
## 40 0.6532 nan 0.1000 0.0010
## 60 0.6264 nan 0.1000 0.0003
## 80 0.6106 nan 0.1000 0.0002
## 100 0.5997 nan 0.1000 0.0001
## 120 0.5906 nan 0.1000 0.0003
## 140 0.5850 nan 0.1000 0.0000
## 150 0.5827 nan 0.1000 -0.0000
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0592 nan 0.1000 0.0186
## 2 1.0298 nan 0.1000 0.0150
## 3 1.0052 nan 0.1000 0.0122
## 4 0.9850 nan 0.1000 0.0096
## 5 0.9653 nan 0.1000 0.0098
## 6 0.9515 nan 0.1000 0.0069
## 7 0.9350 nan 0.1000 0.0084
## 8 0.9193 nan 0.1000 0.0078
## 9 0.9061 nan 0.1000 0.0064
## 10 0.8932 nan 0.1000 0.0065
## 20 0.8137 nan 0.1000 0.0032
## 40 0.7373 nan 0.1000 0.0012
## 60 0.6957 nan 0.1000 0.0005
## 80 0.6710 nan 0.1000 0.0002
## 100 0.6550 nan 0.1000 0.0002
## 120 0.6428 nan 0.1000 0.0001
## 140 0.6359 nan 0.1000 0.0001
## 150 0.6329 nan 0.1000 0.0000
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0436 nan 0.1000 0.0257
## 2 1.0042 nan 0.1000 0.0193
## 3 0.9717 nan 0.1000 0.0160
## 4 0.9436 nan 0.1000 0.0143
## 5 0.9146 nan 0.1000 0.0143
## 6 0.8914 nan 0.1000 0.0116
## 7 0.8756 nan 0.1000 0.0081
## 8 0.8602 nan 0.1000 0.0077
## 9 0.8444 nan 0.1000 0.0082
## 10 0.8330 nan 0.1000 0.0056
## 20 0.7507 nan 0.1000 0.0028
## 40 0.6780 nan 0.1000 0.0010
## 60 0.6461 nan 0.1000 0.0004
## 80 0.6305 nan 0.1000 0.0001
## 100 0.6197 nan 0.1000 0.0002
## 120 0.6113 nan 0.1000 0.0002
## 140 0.6053 nan 0.1000 -0.0000
## 150 0.6026 nan 0.1000 0.0000
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0358 nan 0.1000 0.0294
## 2 0.9889 nan 0.1000 0.0229
## 3 0.9511 nan 0.1000 0.0187
## 4 0.9214 nan 0.1000 0.0150
## 5 0.8956 nan 0.1000 0.0126
## 6 0.8706 nan 0.1000 0.0125
## 7 0.8491 nan 0.1000 0.0107
## 8 0.8315 nan 0.1000 0.0089
## 9 0.8155 nan 0.1000 0.0077
## 10 0.8008 nan 0.1000 0.0071
## 20 0.7214 nan 0.1000 0.0030
## 40 0.6546 nan 0.1000 0.0007
## 60 0.6268 nan 0.1000 0.0007
## 80 0.6112 nan 0.1000 0.0002
## 100 0.6013 nan 0.1000 0.0000
## 120 0.5946 nan 0.1000 0.0001
## 140 0.5874 nan 0.1000 -0.0000
## 150 0.5844 nan 0.1000 0.0000
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0576 nan 0.1000 0.0187
## 2 1.0276 nan 0.1000 0.0150
## 3 1.0031 nan 0.1000 0.0122
## 4 0.9825 nan 0.1000 0.0102
## 5 0.9649 nan 0.1000 0.0086
## 6 0.9459 nan 0.1000 0.0096
## 7 0.9324 nan 0.1000 0.0069
## 8 0.9166 nan 0.1000 0.0078
## 9 0.9038 nan 0.1000 0.0063
## 10 0.8908 nan 0.1000 0.0064
## 20 0.8132 nan 0.1000 0.0022
## 40 0.7344 nan 0.1000 0.0012
## 60 0.6931 nan 0.1000 0.0010
## 80 0.6686 nan 0.1000 0.0006
## 100 0.6537 nan 0.1000 0.0001
## 120 0.6420 nan 0.1000 0.0002
## 140 0.6340 nan 0.1000 0.0001
## 150 0.6308 nan 0.1000 0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0429 nan 0.1000 0.0259
## 2 1.0035 nan 0.1000 0.0195
## 3 0.9707 nan 0.1000 0.0163
## 4 0.9350 nan 0.1000 0.0179
## 5 0.9069 nan 0.1000 0.0142
## 6 0.8845 nan 0.1000 0.0111
## 7 0.8678 nan 0.1000 0.0085
## 8 0.8520 nan 0.1000 0.0078
## 9 0.8362 nan 0.1000 0.0076
## 10 0.8251 nan 0.1000 0.0054
## 20 0.7463 nan 0.1000 0.0025
## 40 0.6747 nan 0.1000 0.0009
## 60 0.6449 nan 0.1000 0.0003
## 80 0.6289 nan 0.1000 0.0003
## 100 0.6174 nan 0.1000 0.0003
## 120 0.6096 nan 0.1000 0.0000
## 140 0.6039 nan 0.1000 0.0001
## 150 0.6013 nan 0.1000 0.0000
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0349 nan 0.1000 0.0302
## 2 0.9895 nan 0.1000 0.0226
## 3 0.9526 nan 0.1000 0.0180
## 4 0.9206 nan 0.1000 0.0162
## 5 0.8944 nan 0.1000 0.0132
## 6 0.8693 nan 0.1000 0.0127
## 7 0.8500 nan 0.1000 0.0097
## 8 0.8300 nan 0.1000 0.0098
## 9 0.8164 nan 0.1000 0.0066
## 10 0.8019 nan 0.1000 0.0074
## 20 0.7178 nan 0.1000 0.0025
## 40 0.6516 nan 0.1000 0.0011
## 60 0.6248 nan 0.1000 0.0003
## 80 0.6090 nan 0.1000 0.0001
## 100 0.6007 nan 0.1000 0.0001
## 120 0.5930 nan 0.1000 0.0001
## 140 0.5872 nan 0.1000 0.0000
## 150 0.5854 nan 0.1000 -0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0585 nan 0.1000 0.0185
## 2 1.0284 nan 0.1000 0.0149
## 3 1.0044 nan 0.1000 0.0121
## 4 0.9838 nan 0.1000 0.0100
## 5 0.9641 nan 0.1000 0.0097
## 6 0.9469 nan 0.1000 0.0084
## 7 0.9340 nan 0.1000 0.0065
## 8 0.9183 nan 0.1000 0.0077
## 9 0.9056 nan 0.1000 0.0063
## 10 0.8959 nan 0.1000 0.0050
## 20 0.8168 nan 0.1000 0.0023
## 40 0.7369 nan 0.1000 0.0011
## 60 0.6944 nan 0.1000 0.0011
## 80 0.6713 nan 0.1000 0.0003
## 100 0.6564 nan 0.1000 0.0005
## 120 0.6449 nan 0.1000 0.0002
## 140 0.6367 nan 0.1000 0.0002
## 150 0.6336 nan 0.1000 0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0446 nan 0.1000 0.0258
## 2 1.0054 nan 0.1000 0.0199
## 3 0.9733 nan 0.1000 0.0157
## 4 0.9378 nan 0.1000 0.0182
## 5 0.9105 nan 0.1000 0.0135
## 6 0.8884 nan 0.1000 0.0112
## 7 0.8701 nan 0.1000 0.0093
## 8 0.8562 nan 0.1000 0.0069
## 9 0.8420 nan 0.1000 0.0071
## 10 0.8314 nan 0.1000 0.0053
## 20 0.7477 nan 0.1000 0.0037
## 40 0.6770 nan 0.1000 0.0010
## 60 0.6477 nan 0.1000 0.0004
## 80 0.6324 nan 0.1000 0.0002
## 100 0.6205 nan 0.1000 0.0003
## 120 0.6122 nan 0.1000 0.0001
## 140 0.6067 nan 0.1000 0.0001
## 150 0.6040 nan 0.1000 0.0002
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0359 nan 0.1000 0.0295
## 2 0.9897 nan 0.1000 0.0225
## 3 0.9522 nan 0.1000 0.0184
## 4 0.9207 nan 0.1000 0.0159
## 5 0.8954 nan 0.1000 0.0128
## 6 0.8698 nan 0.1000 0.0126
## 7 0.8508 nan 0.1000 0.0095
## 8 0.8325 nan 0.1000 0.0090
## 9 0.8192 nan 0.1000 0.0066
## 10 0.8046 nan 0.1000 0.0071
## 20 0.7202 nan 0.1000 0.0027
## 40 0.6540 nan 0.1000 0.0009
## 60 0.6274 nan 0.1000 0.0007
## 80 0.6106 nan 0.1000 0.0006
## 100 0.6004 nan 0.1000 0.0001
## 120 0.5948 nan 0.1000 0.0002
## 140 0.5889 nan 0.1000 0.0000
## 150 0.5871 nan 0.1000 0.0000
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0585 nan 0.1000 0.0186
## 2 1.0281 nan 0.1000 0.0150
## 3 1.0031 nan 0.1000 0.0122
## 4 0.9828 nan 0.1000 0.0101
## 5 0.9632 nan 0.1000 0.0097
## 6 0.9455 nan 0.1000 0.0087
## 7 0.9324 nan 0.1000 0.0065
## 8 0.9164 nan 0.1000 0.0077
## 9 0.9034 nan 0.1000 0.0065
## 10 0.8940 nan 0.1000 0.0047
## 20 0.8128 nan 0.1000 0.0023
## 40 0.7335 nan 0.1000 0.0013
## 60 0.6927 nan 0.1000 0.0007
## 80 0.6686 nan 0.1000 0.0006
## 100 0.6529 nan 0.1000 0.0004
## 120 0.6424 nan 0.1000 0.0002
## 140 0.6347 nan 0.1000 0.0003
## 150 0.6319 nan 0.1000 0.0000
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0423 nan 0.1000 0.0261
## 2 1.0025 nan 0.1000 0.0200
## 3 0.9693 nan 0.1000 0.0165
## 4 0.9426 nan 0.1000 0.0131
## 5 0.9136 nan 0.1000 0.0145
## 6 0.8932 nan 0.1000 0.0097
## 7 0.8724 nan 0.1000 0.0105
## 8 0.8561 nan 0.1000 0.0081
## 9 0.8425 nan 0.1000 0.0067
## 10 0.8292 nan 0.1000 0.0066
## 20 0.7465 nan 0.1000 0.0028
## 40 0.6750 nan 0.1000 0.0012
## 60 0.6458 nan 0.1000 0.0004
## 80 0.6298 nan 0.1000 0.0004
## 100 0.6182 nan 0.1000 0.0001
## 120 0.6098 nan 0.1000 0.0001
## 140 0.6043 nan 0.1000 0.0001
## 150 0.6015 nan 0.1000 0.0002
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0353 nan 0.1000 0.0298
## 2 0.9874 nan 0.1000 0.0236
## 3 0.9512 nan 0.1000 0.0184
## 4 0.9188 nan 0.1000 0.0161
## 5 0.8936 nan 0.1000 0.0126
## 6 0.8708 nan 0.1000 0.0115
## 7 0.8492 nan 0.1000 0.0109
## 8 0.8310 nan 0.1000 0.0091
## 9 0.8173 nan 0.1000 0.0064
## 10 0.8047 nan 0.1000 0.0059
## 20 0.7172 nan 0.1000 0.0028
## 40 0.6512 nan 0.1000 0.0008
## 60 0.6241 nan 0.1000 0.0003
## 80 0.6086 nan 0.1000 0.0002
## 100 0.5981 nan 0.1000 0.0001
## 120 0.5908 nan 0.1000 0.0000
## 140 0.5852 nan 0.1000 -0.0000
## 150 0.5821 nan 0.1000 0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0575 nan 0.1000 0.0187
## 2 1.0272 nan 0.1000 0.0150
## 3 1.0024 nan 0.1000 0.0122
## 4 0.9822 nan 0.1000 0.0105
## 5 0.9626 nan 0.1000 0.0098
## 6 0.9460 nan 0.1000 0.0086
## 7 0.9326 nan 0.1000 0.0065
## 8 0.9172 nan 0.1000 0.0078
## 9 0.9071 nan 0.1000 0.0049
## 10 0.8950 nan 0.1000 0.0063
## 20 0.8139 nan 0.1000 0.0035
## 40 0.7337 nan 0.1000 0.0013
## 60 0.6914 nan 0.1000 0.0005
## 80 0.6682 nan 0.1000 0.0003
## 100 0.6524 nan 0.1000 0.0003
## 120 0.6411 nan 0.1000 0.0001
## 140 0.6333 nan 0.1000 0.0000
## 150 0.6299 nan 0.1000 0.0002
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0430 nan 0.1000 0.0259
## 2 1.0030 nan 0.1000 0.0199
## 3 0.9696 nan 0.1000 0.0169
## 4 0.9430 nan 0.1000 0.0135
## 5 0.9136 nan 0.1000 0.0147
## 6 0.8898 nan 0.1000 0.0117
## 7 0.8736 nan 0.1000 0.0078
## 8 0.8557 nan 0.1000 0.0091
## 9 0.8404 nan 0.1000 0.0077
## 10 0.8298 nan 0.1000 0.0054
## 20 0.7470 nan 0.1000 0.0028
## 40 0.6753 nan 0.1000 0.0013
## 60 0.6450 nan 0.1000 0.0003
## 80 0.6280 nan 0.1000 0.0002
## 100 0.6180 nan 0.1000 0.0001
## 120 0.6085 nan 0.1000 0.0001
## 140 0.6018 nan 0.1000 0.0000
## 150 0.5996 nan 0.1000 0.0000
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0352 nan 0.1000 0.0296
## 2 0.9878 nan 0.1000 0.0240
## 3 0.9509 nan 0.1000 0.0184
## 4 0.9194 nan 0.1000 0.0158
## 5 0.8918 nan 0.1000 0.0134
## 6 0.8694 nan 0.1000 0.0109
## 7 0.8477 nan 0.1000 0.0110
## 8 0.8294 nan 0.1000 0.0091
## 9 0.8138 nan 0.1000 0.0079
## 10 0.8018 nan 0.1000 0.0057
## 20 0.7190 nan 0.1000 0.0028
## 40 0.6512 nan 0.1000 0.0012
## 60 0.6243 nan 0.1000 0.0005
## 80 0.6076 nan 0.1000 0.0004
## 100 0.5968 nan 0.1000 0.0001
## 120 0.5889 nan 0.1000 0.0001
## 140 0.5834 nan 0.1000 0.0002
## 150 0.5817 nan 0.1000 0.0000
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0581 nan 0.1000 0.0186
## 2 1.0282 nan 0.1000 0.0150
## 3 1.0040 nan 0.1000 0.0122
## 4 0.9841 nan 0.1000 0.0103
## 5 0.9644 nan 0.1000 0.0098
## 6 0.9474 nan 0.1000 0.0083
## 7 0.9342 nan 0.1000 0.0064
## 8 0.9184 nan 0.1000 0.0078
## 9 0.9084 nan 0.1000 0.0048
## 10 0.8962 nan 0.1000 0.0061
## 20 0.8163 nan 0.1000 0.0021
## 40 0.7363 nan 0.1000 0.0010
## 60 0.6969 nan 0.1000 0.0007
## 80 0.6712 nan 0.1000 0.0003
## 100 0.6562 nan 0.1000 0.0002
## 120 0.6452 nan 0.1000 0.0002
## 140 0.6368 nan 0.1000 0.0002
## 150 0.6339 nan 0.1000 0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0438 nan 0.1000 0.0257
## 2 1.0041 nan 0.1000 0.0199
## 3 0.9707 nan 0.1000 0.0170
## 4 0.9429 nan 0.1000 0.0135
## 5 0.9144 nan 0.1000 0.0142
## 6 0.8909 nan 0.1000 0.0118
## 7 0.8714 nan 0.1000 0.0097
## 8 0.8559 nan 0.1000 0.0079
## 9 0.8424 nan 0.1000 0.0066
## 10 0.8310 nan 0.1000 0.0058
## 20 0.7501 nan 0.1000 0.0026
## 40 0.6785 nan 0.1000 0.0012
## 60 0.6480 nan 0.1000 0.0008
## 80 0.6321 nan 0.1000 0.0003
## 100 0.6207 nan 0.1000 0.0001
## 120 0.6116 nan 0.1000 0.0002
## 140 0.6055 nan 0.1000 0.0001
## 150 0.6028 nan 0.1000 0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0369 nan 0.1000 0.0297
## 2 0.9907 nan 0.1000 0.0226
## 3 0.9542 nan 0.1000 0.0182
## 4 0.9221 nan 0.1000 0.0163
## 5 0.8961 nan 0.1000 0.0130
## 6 0.8735 nan 0.1000 0.0113
## 7 0.8526 nan 0.1000 0.0106
## 8 0.8377 nan 0.1000 0.0073
## 9 0.8206 nan 0.1000 0.0084
## 10 0.8074 nan 0.1000 0.0064
## 20 0.7236 nan 0.1000 0.0031
## 40 0.6547 nan 0.1000 0.0014
## 60 0.6273 nan 0.1000 0.0007
## 80 0.6112 nan 0.1000 0.0001
## 100 0.6005 nan 0.1000 0.0002
## 120 0.5920 nan 0.1000 0.0001
## 140 0.5869 nan 0.1000 0.0000
## 150 0.5849 nan 0.1000 -0.0000
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0586 nan 0.1000 0.0186
## 2 1.0287 nan 0.1000 0.0150
## 3 1.0041 nan 0.1000 0.0122
## 4 0.9834 nan 0.1000 0.0108
## 5 0.9636 nan 0.1000 0.0097
## 6 0.9494 nan 0.1000 0.0073
## 7 0.9331 nan 0.1000 0.0083
## 8 0.9181 nan 0.1000 0.0077
## 9 0.9080 nan 0.1000 0.0048
## 10 0.8958 nan 0.1000 0.0061
## 20 0.8157 nan 0.1000 0.0035
## 40 0.7360 nan 0.1000 0.0012
## 60 0.6960 nan 0.1000 0.0004
## 80 0.6705 nan 0.1000 0.0005
## 100 0.6541 nan 0.1000 0.0004
## 120 0.6441 nan 0.1000 0.0002
## 140 0.6366 nan 0.1000 0.0001
## 150 0.6339 nan 0.1000 0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0443 nan 0.1000 0.0258
## 2 1.0042 nan 0.1000 0.0196
## 3 0.9711 nan 0.1000 0.0171
## 4 0.9450 nan 0.1000 0.0129
## 5 0.9211 nan 0.1000 0.0117
## 6 0.8969 nan 0.1000 0.0125
## 7 0.8779 nan 0.1000 0.0093
## 8 0.8594 nan 0.1000 0.0091
## 9 0.8441 nan 0.1000 0.0079
## 10 0.8328 nan 0.1000 0.0056
## 20 0.7503 nan 0.1000 0.0033
## 40 0.6779 nan 0.1000 0.0015
## 60 0.6474 nan 0.1000 0.0004
## 80 0.6305 nan 0.1000 0.0003
## 100 0.6201 nan 0.1000 0.0001
## 120 0.6111 nan 0.1000 0.0003
## 140 0.6043 nan 0.1000 0.0000
## 150 0.6017 nan 0.1000 0.0000
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0369 nan 0.1000 0.0294
## 2 0.9899 nan 0.1000 0.0237
## 3 0.9518 nan 0.1000 0.0188
## 4 0.9206 nan 0.1000 0.0156
## 5 0.8949 nan 0.1000 0.0127
## 6 0.8723 nan 0.1000 0.0111
## 7 0.8503 nan 0.1000 0.0109
## 8 0.8325 nan 0.1000 0.0089
## 9 0.8171 nan 0.1000 0.0077
## 10 0.8031 nan 0.1000 0.0070
## 20 0.7235 nan 0.1000 0.0023
## 40 0.6550 nan 0.1000 0.0007
## 60 0.6273 nan 0.1000 0.0003
## 80 0.6102 nan 0.1000 0.0001
## 100 0.5990 nan 0.1000 0.0001
## 120 0.5915 nan 0.1000 -0.0000
## 140 0.5858 nan 0.1000 0.0000
## 150 0.5830 nan 0.1000 0.0000
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0582 nan 0.1000 0.0187
## 2 1.0284 nan 0.1000 0.0151
## 3 1.0039 nan 0.1000 0.0123
## 4 0.9837 nan 0.1000 0.0098
## 5 0.9638 nan 0.1000 0.0098
## 6 0.9498 nan 0.1000 0.0069
## 7 0.9337 nan 0.1000 0.0083
## 8 0.9182 nan 0.1000 0.0078
## 9 0.9053 nan 0.1000 0.0065
## 10 0.8959 nan 0.1000 0.0047
## 20 0.8123 nan 0.1000 0.0033
## 40 0.7337 nan 0.1000 0.0011
## 60 0.6934 nan 0.1000 0.0005
## 80 0.6691 nan 0.1000 0.0003
## 100 0.6528 nan 0.1000 0.0004
## 120 0.6416 nan 0.1000 0.0004
## 140 0.6335 nan 0.1000 0.0001
## 150 0.6304 nan 0.1000 0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0435 nan 0.1000 0.0261
## 2 1.0035 nan 0.1000 0.0197
## 3 0.9701 nan 0.1000 0.0166
## 4 0.9427 nan 0.1000 0.0136
## 5 0.9131 nan 0.1000 0.0145
## 6 0.8897 nan 0.1000 0.0117
## 7 0.8734 nan 0.1000 0.0080
## 8 0.8580 nan 0.1000 0.0076
## 9 0.8417 nan 0.1000 0.0082
## 10 0.8302 nan 0.1000 0.0055
## 20 0.7468 nan 0.1000 0.0033
## 40 0.6754 nan 0.1000 0.0011
## 60 0.6448 nan 0.1000 0.0004
## 80 0.6291 nan 0.1000 0.0001
## 100 0.6183 nan 0.1000 0.0004
## 120 0.6104 nan 0.1000 0.0001
## 140 0.6036 nan 0.1000 0.0001
## 150 0.6006 nan 0.1000 0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0357 nan 0.1000 0.0296
## 2 0.9880 nan 0.1000 0.0236
## 3 0.9496 nan 0.1000 0.0187
## 4 0.9196 nan 0.1000 0.0149
## 5 0.8900 nan 0.1000 0.0145
## 6 0.8687 nan 0.1000 0.0109
## 7 0.8509 nan 0.1000 0.0087
## 8 0.8320 nan 0.1000 0.0095
## 9 0.8162 nan 0.1000 0.0077
## 10 0.8012 nan 0.1000 0.0072
## 20 0.7207 nan 0.1000 0.0023
## 40 0.6512 nan 0.1000 0.0009
## 60 0.6241 nan 0.1000 0.0004
## 80 0.6084 nan 0.1000 0.0003
## 100 0.5982 nan 0.1000 0.0000
## 120 0.5908 nan 0.1000 0.0000
## 140 0.5848 nan 0.1000 0.0000
## 150 0.5826 nan 0.1000 0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0574 nan 0.1000 0.0188
## 2 1.0276 nan 0.1000 0.0151
## 3 1.0030 nan 0.1000 0.0123
## 4 0.9830 nan 0.1000 0.0103
## 5 0.9633 nan 0.1000 0.0099
## 6 0.9457 nan 0.1000 0.0086
## 7 0.9294 nan 0.1000 0.0080
## 8 0.9166 nan 0.1000 0.0067
## 9 0.9067 nan 0.1000 0.0048
## 10 0.8939 nan 0.1000 0.0063
## 20 0.8124 nan 0.1000 0.0033
## 40 0.7343 nan 0.1000 0.0009
## 60 0.6920 nan 0.1000 0.0010
## 80 0.6676 nan 0.1000 0.0005
## 100 0.6515 nan 0.1000 0.0001
## 120 0.6409 nan 0.1000 0.0001
## 140 0.6343 nan 0.1000 0.0001
## 150 0.6309 nan 0.1000 0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0431 nan 0.1000 0.0264
## 2 1.0034 nan 0.1000 0.0201
## 3 0.9701 nan 0.1000 0.0165
## 4 0.9434 nan 0.1000 0.0135
## 5 0.9142 nan 0.1000 0.0145
## 6 0.8907 nan 0.1000 0.0116
## 7 0.8747 nan 0.1000 0.0080
## 8 0.8568 nan 0.1000 0.0090
## 9 0.8423 nan 0.1000 0.0071
## 10 0.8284 nan 0.1000 0.0068
## 20 0.7486 nan 0.1000 0.0029
## 40 0.6760 nan 0.1000 0.0007
## 60 0.6452 nan 0.1000 0.0006
## 80 0.6282 nan 0.1000 0.0002
## 100 0.6176 nan 0.1000 0.0002
## 120 0.6083 nan 0.1000 0.0001
## 140 0.6019 nan 0.1000 0.0002
## 150 0.5993 nan 0.1000 0.0000
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0361 nan 0.1000 0.0300
## 2 0.9890 nan 0.1000 0.0234
## 3 0.9509 nan 0.1000 0.0192
## 4 0.9192 nan 0.1000 0.0156
## 5 0.8924 nan 0.1000 0.0134
## 6 0.8673 nan 0.1000 0.0124
## 7 0.8462 nan 0.1000 0.0105
## 8 0.8284 nan 0.1000 0.0087
## 9 0.8158 nan 0.1000 0.0062
## 10 0.8015 nan 0.1000 0.0071
## 20 0.7204 nan 0.1000 0.0023
## 40 0.6528 nan 0.1000 0.0014
## 60 0.6258 nan 0.1000 0.0006
## 80 0.6083 nan 0.1000 0.0004
## 100 0.5978 nan 0.1000 0.0001
## 120 0.5906 nan 0.1000 0.0000
## 140 0.5846 nan 0.1000 0.0001
## 150 0.5824 nan 0.1000 0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0572 nan 0.1000 0.0186
## 2 1.0275 nan 0.1000 0.0150
## 3 1.0031 nan 0.1000 0.0122
## 4 0.9827 nan 0.1000 0.0104
## 5 0.9629 nan 0.1000 0.0098
## 6 0.9463 nan 0.1000 0.0083
## 7 0.9334 nan 0.1000 0.0066
## 8 0.9176 nan 0.1000 0.0078
## 9 0.9049 nan 0.1000 0.0064
## 10 0.8952 nan 0.1000 0.0048
## 20 0.8142 nan 0.1000 0.0023
## 40 0.7339 nan 0.1000 0.0011
## 60 0.6939 nan 0.1000 0.0008
## 80 0.6688 nan 0.1000 0.0004
## 100 0.6532 nan 0.1000 0.0003
## 120 0.6423 nan 0.1000 0.0002
## 140 0.6339 nan 0.1000 0.0002
## 150 0.6307 nan 0.1000 0.0000
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0438 nan 0.1000 0.0260
## 2 1.0035 nan 0.1000 0.0200
## 3 0.9718 nan 0.1000 0.0158
## 4 0.9361 nan 0.1000 0.0179
## 5 0.9138 nan 0.1000 0.0108
## 6 0.8888 nan 0.1000 0.0124
## 7 0.8687 nan 0.1000 0.0102
## 8 0.8533 nan 0.1000 0.0080
## 9 0.8379 nan 0.1000 0.0076
## 10 0.8271 nan 0.1000 0.0053
## 20 0.7450 nan 0.1000 0.0027
## 40 0.6754 nan 0.1000 0.0012
## 60 0.6454 nan 0.1000 0.0006
## 80 0.6296 nan 0.1000 0.0004
## 100 0.6184 nan 0.1000 0.0003
## 120 0.6104 nan 0.1000 0.0002
## 140 0.6037 nan 0.1000 0.0001
## 150 0.6008 nan 0.1000 0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0355 nan 0.1000 0.0296
## 2 0.9891 nan 0.1000 0.0231
## 3 0.9519 nan 0.1000 0.0185
## 4 0.9195 nan 0.1000 0.0161
## 5 0.8945 nan 0.1000 0.0125
## 6 0.8685 nan 0.1000 0.0128
## 7 0.8497 nan 0.1000 0.0092
## 8 0.8311 nan 0.1000 0.0094
## 9 0.8154 nan 0.1000 0.0077
## 10 0.8038 nan 0.1000 0.0054
## 20 0.7178 nan 0.1000 0.0025
## 40 0.6533 nan 0.1000 0.0009
## 60 0.6251 nan 0.1000 0.0004
## 80 0.6081 nan 0.1000 0.0002
## 100 0.5981 nan 0.1000 0.0002
## 120 0.5905 nan 0.1000 0.0000
## 140 0.5843 nan 0.1000 0.0001
## 150 0.5818 nan 0.1000 0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0584 nan 0.1000 0.0185
## 2 1.0287 nan 0.1000 0.0149
## 3 1.0044 nan 0.1000 0.0121
## 4 0.9847 nan 0.1000 0.0101
## 5 0.9648 nan 0.1000 0.0097
## 6 0.9511 nan 0.1000 0.0064
## 7 0.9345 nan 0.1000 0.0084
## 8 0.9190 nan 0.1000 0.0077
## 9 0.9057 nan 0.1000 0.0064
## 10 0.8960 nan 0.1000 0.0048
## 20 0.8141 nan 0.1000 0.0033
## 40 0.7376 nan 0.1000 0.0008
## 60 0.6960 nan 0.1000 0.0005
## 80 0.6717 nan 0.1000 0.0006
## 100 0.6565 nan 0.1000 0.0002
## 120 0.6454 nan 0.1000 0.0003
## 140 0.6382 nan 0.1000 0.0001
## 150 0.6344 nan 0.1000 0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0437 nan 0.1000 0.0259
## 2 1.0052 nan 0.1000 0.0195
## 3 0.9719 nan 0.1000 0.0162
## 4 0.9444 nan 0.1000 0.0137
## 5 0.9153 nan 0.1000 0.0148
## 6 0.8919 nan 0.1000 0.0117
## 7 0.8742 nan 0.1000 0.0086
## 8 0.8592 nan 0.1000 0.0074
## 9 0.8433 nan 0.1000 0.0078
## 10 0.8323 nan 0.1000 0.0056
## 20 0.7503 nan 0.1000 0.0025
## 40 0.6799 nan 0.1000 0.0008
## 60 0.6488 nan 0.1000 0.0005
## 80 0.6319 nan 0.1000 0.0001
## 100 0.6208 nan 0.1000 0.0001
## 120 0.6118 nan 0.1000 0.0003
## 140 0.6059 nan 0.1000 -0.0000
## 150 0.6034 nan 0.1000 0.0000
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0363 nan 0.1000 0.0298
## 2 0.9897 nan 0.1000 0.0234
## 3 0.9530 nan 0.1000 0.0186
## 4 0.9199 nan 0.1000 0.0161
## 5 0.8952 nan 0.1000 0.0124
## 6 0.8720 nan 0.1000 0.0115
## 7 0.8501 nan 0.1000 0.0109
## 8 0.8322 nan 0.1000 0.0088
## 9 0.8164 nan 0.1000 0.0078
## 10 0.8029 nan 0.1000 0.0068
## 20 0.7224 nan 0.1000 0.0029
## 40 0.6556 nan 0.1000 0.0007
## 60 0.6280 nan 0.1000 0.0004
## 80 0.6121 nan 0.1000 0.0001
## 100 0.6018 nan 0.1000 0.0000
## 120 0.5931 nan 0.1000 0.0001
## 140 0.5879 nan 0.1000 -0.0000
## 150 0.5861 nan 0.1000 0.0000
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0581 nan 0.1000 0.0184
## 2 1.0284 nan 0.1000 0.0148
## 3 1.0044 nan 0.1000 0.0120
## 4 0.9838 nan 0.1000 0.0106
## 5 0.9656 nan 0.1000 0.0087
## 6 0.9468 nan 0.1000 0.0095
## 7 0.9310 nan 0.1000 0.0078
## 8 0.9179 nan 0.1000 0.0065
## 9 0.9050 nan 0.1000 0.0063
## 10 0.8952 nan 0.1000 0.0046
## 20 0.8144 nan 0.1000 0.0025
## 40 0.7359 nan 0.1000 0.0012
## 60 0.6947 nan 0.1000 0.0009
## 80 0.6708 nan 0.1000 0.0005
## 100 0.6542 nan 0.1000 0.0004
## 120 0.6449 nan 0.1000 0.0001
## 140 0.6372 nan 0.1000 0.0001
## 150 0.6328 nan 0.1000 0.0003
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0436 nan 0.1000 0.0259
## 2 1.0040 nan 0.1000 0.0198
## 3 0.9711 nan 0.1000 0.0163
## 4 0.9384 nan 0.1000 0.0164
## 5 0.9154 nan 0.1000 0.0117
## 6 0.8918 nan 0.1000 0.0117
## 7 0.8724 nan 0.1000 0.0095
## 8 0.8574 nan 0.1000 0.0074
## 9 0.8454 nan 0.1000 0.0060
## 10 0.8326 nan 0.1000 0.0064
## 20 0.7499 nan 0.1000 0.0028
## 40 0.6778 nan 0.1000 0.0010
## 60 0.6477 nan 0.1000 0.0005
## 80 0.6313 nan 0.1000 0.0002
## 100 0.6204 nan 0.1000 0.0003
## 120 0.6120 nan 0.1000 -0.0000
## 140 0.6050 nan 0.1000 0.0000
## 150 0.6022 nan 0.1000 0.0000
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0363 nan 0.1000 0.0297
## 2 0.9889 nan 0.1000 0.0239
## 3 0.9512 nan 0.1000 0.0189
## 4 0.9196 nan 0.1000 0.0158
## 5 0.8946 nan 0.1000 0.0127
## 6 0.8696 nan 0.1000 0.0124
## 7 0.8480 nan 0.1000 0.0107
## 8 0.8316 nan 0.1000 0.0080
## 9 0.8158 nan 0.1000 0.0077
## 10 0.8043 nan 0.1000 0.0059
## 20 0.7193 nan 0.1000 0.0025
## 40 0.6533 nan 0.1000 0.0010
## 60 0.6241 nan 0.1000 0.0003
## 80 0.6096 nan 0.1000 0.0000
## 100 0.5982 nan 0.1000 0.0000
## 120 0.5914 nan 0.1000 0.0002
## 140 0.5852 nan 0.1000 0.0000
## 150 0.5826 nan 0.1000 0.0000
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0583 nan 0.1000 0.0188
## 2 1.0281 nan 0.1000 0.0151
## 3 1.0037 nan 0.1000 0.0123
## 4 0.9834 nan 0.1000 0.0106
## 5 0.9636 nan 0.1000 0.0098
## 6 0.9467 nan 0.1000 0.0084
## 7 0.9308 nan 0.1000 0.0080
## 8 0.9182 nan 0.1000 0.0065
## 9 0.9084 nan 0.1000 0.0049
## 10 0.8955 nan 0.1000 0.0061
## 20 0.8138 nan 0.1000 0.0033
## 40 0.7363 nan 0.1000 0.0014
## 60 0.6956 nan 0.1000 0.0006
## 80 0.6707 nan 0.1000 0.0006
## 100 0.6552 nan 0.1000 0.0001
## 120 0.6443 nan 0.1000 0.0001
## 140 0.6358 nan 0.1000 0.0003
## 150 0.6327 nan 0.1000 0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0440 nan 0.1000 0.0258
## 2 1.0039 nan 0.1000 0.0197
## 3 0.9708 nan 0.1000 0.0165
## 4 0.9436 nan 0.1000 0.0137
## 5 0.9146 nan 0.1000 0.0150
## 6 0.8911 nan 0.1000 0.0116
## 7 0.8738 nan 0.1000 0.0088
## 8 0.8578 nan 0.1000 0.0076
## 9 0.8431 nan 0.1000 0.0074
## 10 0.8311 nan 0.1000 0.0061
## 20 0.7490 nan 0.1000 0.0028
## 40 0.6753 nan 0.1000 0.0010
## 60 0.6460 nan 0.1000 0.0006
## 80 0.6300 nan 0.1000 0.0005
## 100 0.6200 nan 0.1000 0.0002
## 120 0.6123 nan 0.1000 0.0001
## 140 0.6060 nan 0.1000 0.0000
## 150 0.6040 nan 0.1000 0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0358 nan 0.1000 0.0296
## 2 0.9896 nan 0.1000 0.0231
## 3 0.9522 nan 0.1000 0.0185
## 4 0.9201 nan 0.1000 0.0156
## 5 0.8948 nan 0.1000 0.0122
## 6 0.8719 nan 0.1000 0.0114
## 7 0.8495 nan 0.1000 0.0110
## 8 0.8342 nan 0.1000 0.0076
## 9 0.8186 nan 0.1000 0.0077
## 10 0.8040 nan 0.1000 0.0072
## 20 0.7215 nan 0.1000 0.0028
## 40 0.6528 nan 0.1000 0.0010
## 60 0.6264 nan 0.1000 0.0005
## 80 0.6113 nan 0.1000 0.0002
## 100 0.6006 nan 0.1000 0.0000
## 120 0.5931 nan 0.1000 0.0001
## 140 0.5875 nan 0.1000 0.0001
## 150 0.5847 nan 0.1000 0.0000
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0580 nan 0.1000 0.0188
## 2 1.0279 nan 0.1000 0.0151
## 3 1.0034 nan 0.1000 0.0123
## 4 0.9833 nan 0.1000 0.0100
## 5 0.9639 nan 0.1000 0.0099
## 6 0.9502 nan 0.1000 0.0066
## 7 0.9342 nan 0.1000 0.0081
## 8 0.9181 nan 0.1000 0.0079
## 9 0.9079 nan 0.1000 0.0050
## 10 0.8957 nan 0.1000 0.0061
## 20 0.8142 nan 0.1000 0.0035
## 40 0.7354 nan 0.1000 0.0015
## 60 0.6951 nan 0.1000 0.0007
## 80 0.6706 nan 0.1000 0.0006
## 100 0.6558 nan 0.1000 0.0001
## 120 0.6449 nan 0.1000 0.0002
## 140 0.6370 nan 0.1000 0.0001
## 150 0.6332 nan 0.1000 0.0002
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0432 nan 0.1000 0.0262
## 2 1.0034 nan 0.1000 0.0203
## 3 0.9696 nan 0.1000 0.0167
## 4 0.9425 nan 0.1000 0.0135
## 5 0.9129 nan 0.1000 0.0146
## 6 0.8895 nan 0.1000 0.0119
## 7 0.8698 nan 0.1000 0.0097
## 8 0.8554 nan 0.1000 0.0072
## 9 0.8420 nan 0.1000 0.0066
## 10 0.8305 nan 0.1000 0.0054
## 20 0.7470 nan 0.1000 0.0033
## 40 0.6784 nan 0.1000 0.0008
## 60 0.6480 nan 0.1000 0.0006
## 80 0.6325 nan 0.1000 0.0003
## 100 0.6210 nan 0.1000 0.0001
## 120 0.6125 nan 0.1000 0.0001
## 140 0.6050 nan 0.1000 0.0001
## 150 0.6024 nan 0.1000 0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0364 nan 0.1000 0.0295
## 2 0.9885 nan 0.1000 0.0235
## 3 0.9515 nan 0.1000 0.0184
## 4 0.9198 nan 0.1000 0.0156
## 5 0.8932 nan 0.1000 0.0133
## 6 0.8708 nan 0.1000 0.0109
## 7 0.8487 nan 0.1000 0.0110
## 8 0.8309 nan 0.1000 0.0089
## 9 0.8148 nan 0.1000 0.0077
## 10 0.8006 nan 0.1000 0.0070
## 20 0.7194 nan 0.1000 0.0029
## 40 0.6530 nan 0.1000 0.0008
## 60 0.6261 nan 0.1000 0.0003
## 80 0.6102 nan 0.1000 0.0002
## 100 0.5983 nan 0.1000 0.0001
## 120 0.5909 nan 0.1000 0.0002
## 140 0.5858 nan 0.1000 0.0001
## 150 0.5838 nan 0.1000 -0.0000
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0587 nan 0.1000 0.0184
## 2 1.0290 nan 0.1000 0.0149
## 3 1.0071 nan 0.1000 0.0107
## 4 0.9834 nan 0.1000 0.0118
## 5 0.9639 nan 0.1000 0.0097
## 6 0.9462 nan 0.1000 0.0085
## 7 0.9330 nan 0.1000 0.0063
## 8 0.9175 nan 0.1000 0.0077
## 9 0.9048 nan 0.1000 0.0064
## 10 0.8952 nan 0.1000 0.0047
## 20 0.8129 nan 0.1000 0.0032
## 40 0.7380 nan 0.1000 0.0011
## 60 0.6947 nan 0.1000 0.0007
## 80 0.6707 nan 0.1000 0.0003
## 100 0.6557 nan 0.1000 0.0002
## 120 0.6441 nan 0.1000 0.0001
## 140 0.6367 nan 0.1000 0.0001
## 150 0.6336 nan 0.1000 0.0000
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0436 nan 0.1000 0.0255
## 2 1.0047 nan 0.1000 0.0197
## 3 0.9717 nan 0.1000 0.0163
## 4 0.9457 nan 0.1000 0.0132
## 5 0.9168 nan 0.1000 0.0148
## 6 0.8932 nan 0.1000 0.0115
## 7 0.8741 nan 0.1000 0.0097
## 8 0.8597 nan 0.1000 0.0073
## 9 0.8445 nan 0.1000 0.0075
## 10 0.8336 nan 0.1000 0.0055
## 20 0.7530 nan 0.1000 0.0025
## 40 0.6792 nan 0.1000 0.0011
## 60 0.6486 nan 0.1000 0.0003
## 80 0.6326 nan 0.1000 0.0003
## 100 0.6219 nan 0.1000 0.0001
## 120 0.6130 nan 0.1000 0.0003
## 140 0.6068 nan 0.1000 0.0000
## 150 0.6042 nan 0.1000 0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0365 nan 0.1000 0.0289
## 2 0.9893 nan 0.1000 0.0234
## 3 0.9526 nan 0.1000 0.0182
## 4 0.9207 nan 0.1000 0.0157
## 5 0.8957 nan 0.1000 0.0125
## 6 0.8730 nan 0.1000 0.0115
## 7 0.8517 nan 0.1000 0.0106
## 8 0.8335 nan 0.1000 0.0091
## 9 0.8206 nan 0.1000 0.0062
## 10 0.8062 nan 0.1000 0.0073
## 20 0.7205 nan 0.1000 0.0031
## 40 0.6538 nan 0.1000 0.0010
## 60 0.6262 nan 0.1000 0.0004
## 80 0.6103 nan 0.1000 0.0002
## 100 0.6012 nan 0.1000 0.0002
## 120 0.5944 nan 0.1000 0.0000
## 140 0.5887 nan 0.1000 -0.0001
## 150 0.5867 nan 0.1000 0.0000
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0586 nan 0.1000 0.0186
## 2 1.0285 nan 0.1000 0.0149
## 3 1.0042 nan 0.1000 0.0122
## 4 0.9841 nan 0.1000 0.0103
## 5 0.9649 nan 0.1000 0.0097
## 6 0.9481 nan 0.1000 0.0085
## 7 0.9350 nan 0.1000 0.0066
## 8 0.9197 nan 0.1000 0.0078
## 9 0.9096 nan 0.1000 0.0051
## 10 0.8968 nan 0.1000 0.0062
## 20 0.8160 nan 0.1000 0.0022
## 40 0.7366 nan 0.1000 0.0017
## 60 0.6954 nan 0.1000 0.0005
## 80 0.6704 nan 0.1000 0.0004
## 100 0.6561 nan 0.1000 0.0001
## 120 0.6443 nan 0.1000 0.0001
## 140 0.6369 nan 0.1000 0.0002
## 150 0.6338 nan 0.1000 0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0431 nan 0.1000 0.0261
## 2 1.0040 nan 0.1000 0.0196
## 3 0.9708 nan 0.1000 0.0167
## 4 0.9383 nan 0.1000 0.0162
## 5 0.9146 nan 0.1000 0.0118
## 6 0.8912 nan 0.1000 0.0117
## 7 0.8716 nan 0.1000 0.0094
## 8 0.8574 nan 0.1000 0.0072
## 9 0.8456 nan 0.1000 0.0060
## 10 0.8325 nan 0.1000 0.0064
## 20 0.7490 nan 0.1000 0.0029
## 40 0.6783 nan 0.1000 0.0010
## 60 0.6478 nan 0.1000 0.0003
## 80 0.6314 nan 0.1000 0.0002
## 100 0.6203 nan 0.1000 0.0001
## 120 0.6121 nan 0.1000 0.0002
## 140 0.6067 nan 0.1000 0.0000
## 150 0.6043 nan 0.1000 -0.0000
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0372 nan 0.1000 0.0296
## 2 0.9901 nan 0.1000 0.0232
## 3 0.9525 nan 0.1000 0.0187
## 4 0.9203 nan 0.1000 0.0162
## 5 0.8944 nan 0.1000 0.0131
## 6 0.8728 nan 0.1000 0.0106
## 7 0.8516 nan 0.1000 0.0105
## 8 0.8329 nan 0.1000 0.0096
## 9 0.8171 nan 0.1000 0.0078
## 10 0.8041 nan 0.1000 0.0064
## 20 0.7197 nan 0.1000 0.0027
## 40 0.6545 nan 0.1000 0.0011
## 60 0.6279 nan 0.1000 0.0005
## 80 0.6134 nan 0.1000 0.0002
## 100 0.6028 nan 0.1000 0.0001
## 120 0.5956 nan 0.1000 0.0000
## 140 0.5900 nan 0.1000 -0.0000
## 150 0.5880 nan 0.1000 0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0583 nan 0.1000 0.0187
## 2 1.0283 nan 0.1000 0.0151
## 3 1.0040 nan 0.1000 0.0123
## 4 0.9833 nan 0.1000 0.0104
## 5 0.9633 nan 0.1000 0.0098
## 6 0.9459 nan 0.1000 0.0086
## 7 0.9326 nan 0.1000 0.0067
## 8 0.9173 nan 0.1000 0.0078
## 9 0.9070 nan 0.1000 0.0051
## 10 0.8947 nan 0.1000 0.0062
## 20 0.8133 nan 0.1000 0.0035
## 40 0.7342 nan 0.1000 0.0016
## 60 0.6933 nan 0.1000 0.0010
## 80 0.6695 nan 0.1000 0.0004
## 100 0.6545 nan 0.1000 0.0001
## 120 0.6437 nan 0.1000 0.0002
## 140 0.6354 nan 0.1000 0.0002
## 150 0.6325 nan 0.1000 0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0439 nan 0.1000 0.0262
## 2 1.0042 nan 0.1000 0.0202
## 3 0.9703 nan 0.1000 0.0168
## 4 0.9436 nan 0.1000 0.0133
## 5 0.9142 nan 0.1000 0.0147
## 6 0.8908 nan 0.1000 0.0118
## 7 0.8712 nan 0.1000 0.0098
## 8 0.8553 nan 0.1000 0.0079
## 9 0.8414 nan 0.1000 0.0070
## 10 0.8302 nan 0.1000 0.0056
## 20 0.7469 nan 0.1000 0.0028
## 40 0.6757 nan 0.1000 0.0012
## 60 0.6457 nan 0.1000 0.0003
## 80 0.6302 nan 0.1000 0.0001
## 100 0.6188 nan 0.1000 0.0001
## 120 0.6108 nan 0.1000 0.0000
## 140 0.6046 nan 0.1000 0.0001
## 150 0.6019 nan 0.1000 -0.0000
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0364 nan 0.1000 0.0298
## 2 0.9891 nan 0.1000 0.0235
## 3 0.9524 nan 0.1000 0.0184
## 4 0.9210 nan 0.1000 0.0158
## 5 0.8941 nan 0.1000 0.0135
## 6 0.8717 nan 0.1000 0.0111
## 7 0.8505 nan 0.1000 0.0109
## 8 0.8316 nan 0.1000 0.0094
## 9 0.8149 nan 0.1000 0.0082
## 10 0.8036 nan 0.1000 0.0052
## 20 0.7208 nan 0.1000 0.0029
## 40 0.6531 nan 0.1000 0.0008
## 60 0.6252 nan 0.1000 0.0003
## 80 0.6084 nan 0.1000 0.0002
## 100 0.5987 nan 0.1000 0.0000
## 120 0.5916 nan 0.1000 0.0001
## 140 0.5872 nan 0.1000 0.0001
## 150 0.5848 nan 0.1000 0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0581 nan 0.1000 0.0185
## 2 1.0288 nan 0.1000 0.0150
## 3 1.0042 nan 0.1000 0.0122
## 4 0.9838 nan 0.1000 0.0099
## 5 0.9646 nan 0.1000 0.0090
## 6 0.9503 nan 0.1000 0.0072
## 7 0.9337 nan 0.1000 0.0083
## 8 0.9201 nan 0.1000 0.0064
## 9 0.9048 nan 0.1000 0.0076
## 10 0.8919 nan 0.1000 0.0063
## 20 0.8142 nan 0.1000 0.0034
## 40 0.7351 nan 0.1000 0.0013
## 60 0.6948 nan 0.1000 0.0010
## 80 0.6708 nan 0.1000 0.0005
## 100 0.6549 nan 0.1000 0.0005
## 120 0.6443 nan 0.1000 0.0001
## 140 0.6368 nan 0.1000 -0.0000
## 150 0.6338 nan 0.1000 0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0435 nan 0.1000 0.0258
## 2 1.0041 nan 0.1000 0.0195
## 3 0.9712 nan 0.1000 0.0165
## 4 0.9433 nan 0.1000 0.0135
## 5 0.9149 nan 0.1000 0.0143
## 6 0.8914 nan 0.1000 0.0119
## 7 0.8750 nan 0.1000 0.0080
## 8 0.8604 nan 0.1000 0.0073
## 9 0.8438 nan 0.1000 0.0083
## 10 0.8322 nan 0.1000 0.0056
## 20 0.7523 nan 0.1000 0.0026
## 40 0.6788 nan 0.1000 0.0008
## 60 0.6494 nan 0.1000 0.0004
## 80 0.6336 nan 0.1000 0.0003
## 100 0.6214 nan 0.1000 0.0004
## 120 0.6137 nan 0.1000 0.0000
## 140 0.6064 nan 0.1000 0.0000
## 150 0.6035 nan 0.1000 0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0356 nan 0.1000 0.0296
## 2 0.9893 nan 0.1000 0.0231
## 3 0.9518 nan 0.1000 0.0188
## 4 0.9205 nan 0.1000 0.0155
## 5 0.8952 nan 0.1000 0.0124
## 6 0.8726 nan 0.1000 0.0108
## 7 0.8540 nan 0.1000 0.0094
## 8 0.8349 nan 0.1000 0.0092
## 9 0.8208 nan 0.1000 0.0066
## 10 0.8069 nan 0.1000 0.0067
## 20 0.7217 nan 0.1000 0.0029
## 40 0.6548 nan 0.1000 0.0009
## 60 0.6284 nan 0.1000 0.0002
## 80 0.6129 nan 0.1000 0.0001
## 100 0.6023 nan 0.1000 0.0001
## 120 0.5958 nan 0.1000 0.0002
## 140 0.5887 nan 0.1000 0.0003
## 150 0.5852 nan 0.1000 0.0000
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0584 nan 0.1000 0.0187
## 2 1.0284 nan 0.1000 0.0151
## 3 1.0035 nan 0.1000 0.0123
## 4 0.9832 nan 0.1000 0.0105
## 5 0.9638 nan 0.1000 0.0098
## 6 0.9502 nan 0.1000 0.0068
## 7 0.9341 nan 0.1000 0.0082
## 8 0.9179 nan 0.1000 0.0078
## 9 0.9052 nan 0.1000 0.0063
## 10 0.8923 nan 0.1000 0.0063
## 20 0.8140 nan 0.1000 0.0023
## 40 0.7343 nan 0.1000 0.0012
## 60 0.6939 nan 0.1000 0.0009
## 80 0.6692 nan 0.1000 0.0004
## 100 0.6531 nan 0.1000 0.0003
## 120 0.6429 nan 0.1000 0.0001
## 140 0.6348 nan 0.1000 0.0003
## 150 0.6316 nan 0.1000 0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0432 nan 0.1000 0.0258
## 2 1.0032 nan 0.1000 0.0197
## 3 0.9699 nan 0.1000 0.0169
## 4 0.9440 nan 0.1000 0.0131
## 5 0.9154 nan 0.1000 0.0147
## 6 0.8919 nan 0.1000 0.0119
## 7 0.8722 nan 0.1000 0.0100
## 8 0.8572 nan 0.1000 0.0078
## 9 0.8430 nan 0.1000 0.0071
## 10 0.8322 nan 0.1000 0.0054
## 20 0.7492 nan 0.1000 0.0026
## 40 0.6761 nan 0.1000 0.0014
## 60 0.6455 nan 0.1000 0.0005
## 80 0.6299 nan 0.1000 0.0001
## 100 0.6185 nan 0.1000 0.0001
## 120 0.6101 nan 0.1000 0.0001
## 140 0.6028 nan 0.1000 0.0000
## 150 0.6004 nan 0.1000 0.0000
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0361 nan 0.1000 0.0297
## 2 0.9902 nan 0.1000 0.0233
## 3 0.9533 nan 0.1000 0.0183
## 4 0.9218 nan 0.1000 0.0157
## 5 0.8920 nan 0.1000 0.0151
## 6 0.8693 nan 0.1000 0.0112
## 7 0.8481 nan 0.1000 0.0103
## 8 0.8319 nan 0.1000 0.0078
## 9 0.8160 nan 0.1000 0.0078
## 10 0.8013 nan 0.1000 0.0074
## 20 0.7187 nan 0.1000 0.0030
## 40 0.6517 nan 0.1000 0.0006
## 60 0.6255 nan 0.1000 0.0005
## 80 0.6093 nan 0.1000 0.0001
## 100 0.6001 nan 0.1000 0.0001
## 120 0.5919 nan 0.1000 0.0000
## 140 0.5856 nan 0.1000 0.0000
## 150 0.5832 nan 0.1000 0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0575 nan 0.1000 0.0186
## 2 1.0270 nan 0.1000 0.0150
## 3 1.0028 nan 0.1000 0.0122
## 4 0.9829 nan 0.1000 0.0097
## 5 0.9679 nan 0.1000 0.0070
## 6 0.9490 nan 0.1000 0.0096
## 7 0.9323 nan 0.1000 0.0081
## 8 0.9165 nan 0.1000 0.0078
## 9 0.9041 nan 0.1000 0.0061
## 10 0.8943 nan 0.1000 0.0048
## 20 0.8141 nan 0.1000 0.0023
## 40 0.7365 nan 0.1000 0.0016
## 60 0.6952 nan 0.1000 0.0008
## 80 0.6709 nan 0.1000 0.0003
## 100 0.6553 nan 0.1000 0.0003
## 120 0.6452 nan 0.1000 0.0001
## 140 0.6352 nan 0.1000 0.0001
## 150 0.6323 nan 0.1000 0.0000
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0431 nan 0.1000 0.0259
## 2 1.0037 nan 0.1000 0.0196
## 3 0.9702 nan 0.1000 0.0169
## 4 0.9436 nan 0.1000 0.0134
## 5 0.9144 nan 0.1000 0.0146
## 6 0.8907 nan 0.1000 0.0116
## 7 0.8730 nan 0.1000 0.0086
## 8 0.8569 nan 0.1000 0.0080
## 9 0.8412 nan 0.1000 0.0081
## 10 0.8293 nan 0.1000 0.0060
## 20 0.7480 nan 0.1000 0.0029
## 40 0.6783 nan 0.1000 0.0010
## 60 0.6468 nan 0.1000 0.0004
## 80 0.6308 nan 0.1000 0.0004
## 100 0.6199 nan 0.1000 0.0002
## 120 0.6122 nan 0.1000 0.0000
## 140 0.6050 nan 0.1000 0.0001
## 150 0.6021 nan 0.1000 0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0348 nan 0.1000 0.0297
## 2 0.9884 nan 0.1000 0.0235
## 3 0.9515 nan 0.1000 0.0187
## 4 0.9189 nan 0.1000 0.0163
## 5 0.8931 nan 0.1000 0.0126
## 6 0.8707 nan 0.1000 0.0110
## 7 0.8521 nan 0.1000 0.0095
## 8 0.8333 nan 0.1000 0.0094
## 9 0.8203 nan 0.1000 0.0064
## 10 0.8069 nan 0.1000 0.0066
## 20 0.7256 nan 0.1000 0.0024
## 40 0.6563 nan 0.1000 0.0010
## 60 0.6292 nan 0.1000 0.0005
## 80 0.6112 nan 0.1000 0.0001
## 100 0.6005 nan 0.1000 -0.0000
## 120 0.5940 nan 0.1000 0.0000
## 140 0.5870 nan 0.1000 0.0002
## 150 0.5850 nan 0.1000 0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0581 nan 0.1000 0.0187
## 2 1.0281 nan 0.1000 0.0151
## 3 1.0032 nan 0.1000 0.0123
## 4 0.9827 nan 0.1000 0.0105
## 5 0.9627 nan 0.1000 0.0098
## 6 0.9492 nan 0.1000 0.0061
## 7 0.9331 nan 0.1000 0.0079
## 8 0.9163 nan 0.1000 0.0081
## 9 0.9063 nan 0.1000 0.0045
## 10 0.8935 nan 0.1000 0.0062
## 20 0.8112 nan 0.1000 0.0034
## 40 0.7354 nan 0.1000 0.0012
## 60 0.6934 nan 0.1000 0.0008
## 80 0.6688 nan 0.1000 0.0002
## 100 0.6536 nan 0.1000 0.0004
## 120 0.6416 nan 0.1000 0.0002
## 140 0.6344 nan 0.1000 0.0002
## 150 0.6312 nan 0.1000 0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0443 nan 0.1000 0.0260
## 2 1.0051 nan 0.1000 0.0198
## 3 0.9720 nan 0.1000 0.0166
## 4 0.9446 nan 0.1000 0.0138
## 5 0.9156 nan 0.1000 0.0147
## 6 0.8923 nan 0.1000 0.0119
## 7 0.8747 nan 0.1000 0.0086
## 8 0.8582 nan 0.1000 0.0080
## 9 0.8423 nan 0.1000 0.0081
## 10 0.8308 nan 0.1000 0.0056
## 20 0.7479 nan 0.1000 0.0034
## 40 0.6775 nan 0.1000 0.0009
## 60 0.6456 nan 0.1000 0.0004
## 80 0.6290 nan 0.1000 0.0002
## 100 0.6187 nan 0.1000 0.0002
## 120 0.6097 nan 0.1000 0.0001
## 140 0.6034 nan 0.1000 0.0001
## 150 0.6013 nan 0.1000 -0.0000
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0371 nan 0.1000 0.0296
## 2 0.9897 nan 0.1000 0.0239
## 3 0.9524 nan 0.1000 0.0189
## 4 0.9203 nan 0.1000 0.0163
## 5 0.8956 nan 0.1000 0.0122
## 6 0.8703 nan 0.1000 0.0127
## 7 0.8486 nan 0.1000 0.0107
## 8 0.8330 nan 0.1000 0.0077
## 9 0.8169 nan 0.1000 0.0078
## 10 0.8058 nan 0.1000 0.0053
## 20 0.7204 nan 0.1000 0.0025
## 40 0.6523 nan 0.1000 0.0011
## 60 0.6250 nan 0.1000 0.0006
## 80 0.6098 nan 0.1000 0.0002
## 100 0.5997 nan 0.1000 0.0001
## 120 0.5913 nan 0.1000 0.0000
## 140 0.5854 nan 0.1000 0.0001
## 150 0.5832 nan 0.1000 0.0000
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0580 nan 0.1000 0.0187
## 2 1.0271 nan 0.1000 0.0151
## 3 1.0026 nan 0.1000 0.0122
## 4 0.9822 nan 0.1000 0.0097
## 5 0.9626 nan 0.1000 0.0098
## 6 0.9452 nan 0.1000 0.0086
## 7 0.9320 nan 0.1000 0.0064
## 8 0.9166 nan 0.1000 0.0078
## 9 0.9039 nan 0.1000 0.0064
## 10 0.8943 nan 0.1000 0.0048
## 20 0.8134 nan 0.1000 0.0024
## 40 0.7339 nan 0.1000 0.0020
## 60 0.6935 nan 0.1000 0.0007
## 80 0.6676 nan 0.1000 0.0007
## 100 0.6526 nan 0.1000 0.0004
## 120 0.6420 nan 0.1000 0.0001
## 140 0.6354 nan 0.1000 0.0001
## 150 0.6320 nan 0.1000 0.0002
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0434 nan 0.1000 0.0262
## 2 1.0038 nan 0.1000 0.0200
## 3 0.9699 nan 0.1000 0.0164
## 4 0.9434 nan 0.1000 0.0134
## 5 0.9139 nan 0.1000 0.0148
## 6 0.8900 nan 0.1000 0.0121
## 7 0.8704 nan 0.1000 0.0100
## 8 0.8561 nan 0.1000 0.0071
## 9 0.8410 nan 0.1000 0.0075
## 10 0.8299 nan 0.1000 0.0053
## 20 0.7477 nan 0.1000 0.0032
## 40 0.6767 nan 0.1000 0.0009
## 60 0.6464 nan 0.1000 0.0005
## 80 0.6295 nan 0.1000 0.0004
## 100 0.6189 nan 0.1000 0.0002
## 120 0.6102 nan 0.1000 0.0003
## 140 0.6042 nan 0.1000 -0.0000
## 150 0.6012 nan 0.1000 0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0362 nan 0.1000 0.0296
## 2 0.9896 nan 0.1000 0.0235
## 3 0.9520 nan 0.1000 0.0188
## 4 0.9194 nan 0.1000 0.0160
## 5 0.8934 nan 0.1000 0.0130
## 6 0.8711 nan 0.1000 0.0111
## 7 0.8493 nan 0.1000 0.0110
## 8 0.8334 nan 0.1000 0.0078
## 9 0.8181 nan 0.1000 0.0076
## 10 0.8057 nan 0.1000 0.0061
## 20 0.7192 nan 0.1000 0.0026
## 40 0.6519 nan 0.1000 0.0007
## 60 0.6249 nan 0.1000 0.0005
## 80 0.6101 nan 0.1000 0.0003
## 100 0.6003 nan 0.1000 0.0001
## 120 0.5919 nan 0.1000 0.0000
## 140 0.5869 nan 0.1000 -0.0000
## 150 0.5839 nan 0.1000 0.0003
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0578 nan 0.1000 0.0188
## 2 1.0275 nan 0.1000 0.0151
## 3 1.0035 nan 0.1000 0.0123
## 4 0.9831 nan 0.1000 0.0100
## 5 0.9632 nan 0.1000 0.0099
## 6 0.9460 nan 0.1000 0.0087
## 7 0.9328 nan 0.1000 0.0062
## 8 0.9171 nan 0.1000 0.0078
## 9 0.9044 nan 0.1000 0.0064
## 10 0.8946 nan 0.1000 0.0049
## 20 0.8158 nan 0.1000 0.0025
## 40 0.7345 nan 0.1000 0.0016
## 60 0.6924 nan 0.1000 0.0010
## 80 0.6671 nan 0.1000 0.0004
## 100 0.6524 nan 0.1000 0.0001
## 120 0.6415 nan 0.1000 0.0003
## 140 0.6331 nan 0.1000 0.0001
## 150 0.6305 nan 0.1000 0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0428 nan 0.1000 0.0260
## 2 1.0028 nan 0.1000 0.0197
## 3 0.9699 nan 0.1000 0.0163
## 4 0.9428 nan 0.1000 0.0139
## 5 0.9138 nan 0.1000 0.0146
## 6 0.8900 nan 0.1000 0.0118
## 7 0.8702 nan 0.1000 0.0096
## 8 0.8553 nan 0.1000 0.0076
## 9 0.8434 nan 0.1000 0.0060
## 10 0.8304 nan 0.1000 0.0065
## 20 0.7491 nan 0.1000 0.0028
## 40 0.6737 nan 0.1000 0.0012
## 60 0.6443 nan 0.1000 0.0002
## 80 0.6278 nan 0.1000 0.0000
## 100 0.6170 nan 0.1000 0.0000
## 120 0.6089 nan 0.1000 0.0001
## 140 0.6029 nan 0.1000 0.0001
## 150 0.6003 nan 0.1000 0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0363 nan 0.1000 0.0300
## 2 0.9893 nan 0.1000 0.0237
## 3 0.9510 nan 0.1000 0.0189
## 4 0.9194 nan 0.1000 0.0158
## 5 0.8938 nan 0.1000 0.0126
## 6 0.8713 nan 0.1000 0.0114
## 7 0.8491 nan 0.1000 0.0111
## 8 0.8309 nan 0.1000 0.0091
## 9 0.8176 nan 0.1000 0.0065
## 10 0.8046 nan 0.1000 0.0065
## 20 0.7191 nan 0.1000 0.0032
## 40 0.6516 nan 0.1000 0.0006
## 60 0.6252 nan 0.1000 0.0004
## 80 0.6083 nan 0.1000 0.0001
## 100 0.5981 nan 0.1000 0.0001
## 120 0.5903 nan 0.1000 0.0000
## 140 0.5839 nan 0.1000 0.0001
## 150 0.5809 nan 0.1000 0.0000
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0580 nan 0.1000 0.0184
## 2 1.0290 nan 0.1000 0.0148
## 3 1.0044 nan 0.1000 0.0121
## 4 0.9838 nan 0.1000 0.0098
## 5 0.9644 nan 0.1000 0.0097
## 6 0.9500 nan 0.1000 0.0070
## 7 0.9333 nan 0.1000 0.0083
## 8 0.9181 nan 0.1000 0.0077
## 9 0.9055 nan 0.1000 0.0065
## 10 0.8959 nan 0.1000 0.0049
## 20 0.8153 nan 0.1000 0.0024
## 40 0.7350 nan 0.1000 0.0015
## 60 0.6944 nan 0.1000 0.0005
## 80 0.6694 nan 0.1000 0.0006
## 100 0.6543 nan 0.1000 0.0002
## 120 0.6439 nan 0.1000 0.0002
## 140 0.6362 nan 0.1000 0.0001
## 150 0.6326 nan 0.1000 0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0437 nan 0.1000 0.0257
## 2 1.0041 nan 0.1000 0.0198
## 3 0.9699 nan 0.1000 0.0162
## 4 0.9432 nan 0.1000 0.0131
## 5 0.9148 nan 0.1000 0.0144
## 6 0.8910 nan 0.1000 0.0116
## 7 0.8715 nan 0.1000 0.0095
## 8 0.8570 nan 0.1000 0.0072
## 9 0.8451 nan 0.1000 0.0059
## 10 0.8321 nan 0.1000 0.0066
## 20 0.7489 nan 0.1000 0.0027
## 40 0.6760 nan 0.1000 0.0008
## 60 0.6477 nan 0.1000 0.0005
## 80 0.6317 nan 0.1000 0.0004
## 100 0.6204 nan 0.1000 0.0002
## 120 0.6110 nan 0.1000 0.0002
## 140 0.6055 nan 0.1000 0.0002
## 150 0.6028 nan 0.1000 0.0000
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0357 nan 0.1000 0.0293
## 2 0.9890 nan 0.1000 0.0234
## 3 0.9509 nan 0.1000 0.0191
## 4 0.9193 nan 0.1000 0.0156
## 5 0.8942 nan 0.1000 0.0124
## 6 0.8728 nan 0.1000 0.0103
## 7 0.8508 nan 0.1000 0.0112
## 8 0.8320 nan 0.1000 0.0093
## 9 0.8160 nan 0.1000 0.0081
## 10 0.8045 nan 0.1000 0.0057
## 20 0.7198 nan 0.1000 0.0026
## 40 0.6529 nan 0.1000 0.0007
## 60 0.6266 nan 0.1000 0.0004
## 80 0.6109 nan 0.1000 0.0001
## 100 0.6000 nan 0.1000 0.0001
## 120 0.5924 nan 0.1000 -0.0000
## 140 0.5871 nan 0.1000 0.0000
## 150 0.5844 nan 0.1000 -0.0000
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0576 nan 0.1000 0.0187
## 2 1.0271 nan 0.1000 0.0150
## 3 1.0033 nan 0.1000 0.0122
## 4 0.9831 nan 0.1000 0.0104
## 5 0.9635 nan 0.1000 0.0098
## 6 0.9463 nan 0.1000 0.0085
## 7 0.9332 nan 0.1000 0.0065
## 8 0.9179 nan 0.1000 0.0078
## 9 0.9046 nan 0.1000 0.0065
## 10 0.8916 nan 0.1000 0.0062
## 20 0.8140 nan 0.1000 0.0024
## 40 0.7362 nan 0.1000 0.0009
## 60 0.6947 nan 0.1000 0.0011
## 80 0.6694 nan 0.1000 0.0006
## 100 0.6543 nan 0.1000 0.0002
## 120 0.6435 nan 0.1000 0.0001
## 140 0.6359 nan 0.1000 0.0000
## 150 0.6327 nan 0.1000 0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0436 nan 0.1000 0.0261
## 2 1.0038 nan 0.1000 0.0197
## 3 0.9707 nan 0.1000 0.0169
## 4 0.9442 nan 0.1000 0.0133
## 5 0.9152 nan 0.1000 0.0145
## 6 0.8915 nan 0.1000 0.0117
## 7 0.8721 nan 0.1000 0.0097
## 8 0.8557 nan 0.1000 0.0079
## 9 0.8424 nan 0.1000 0.0068
## 10 0.8312 nan 0.1000 0.0056
## 20 0.7492 nan 0.1000 0.0027
## 40 0.6767 nan 0.1000 0.0012
## 60 0.6461 nan 0.1000 0.0005
## 80 0.6307 nan 0.1000 0.0004
## 100 0.6195 nan 0.1000 0.0001
## 120 0.6116 nan 0.1000 0.0001
## 140 0.6045 nan 0.1000 0.0001
## 150 0.6018 nan 0.1000 0.0000
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0370 nan 0.1000 0.0300
## 2 0.9905 nan 0.1000 0.0232
## 3 0.9530 nan 0.1000 0.0187
## 4 0.9209 nan 0.1000 0.0161
## 5 0.8948 nan 0.1000 0.0130
## 6 0.8725 nan 0.1000 0.0110
## 7 0.8535 nan 0.1000 0.0094
## 8 0.8345 nan 0.1000 0.0095
## 9 0.8180 nan 0.1000 0.0082
## 10 0.8058 nan 0.1000 0.0059
## 20 0.7208 nan 0.1000 0.0028
## 40 0.6539 nan 0.1000 0.0008
## 60 0.6273 nan 0.1000 0.0005
## 80 0.6098 nan 0.1000 0.0001
## 100 0.5999 nan 0.1000 0.0001
## 120 0.5921 nan 0.1000 0.0001
## 140 0.5868 nan 0.1000 0.0000
## 150 0.5845 nan 0.1000 0.0002
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0582 nan 0.1000 0.0186
## 2 1.0282 nan 0.1000 0.0149
## 3 1.0035 nan 0.1000 0.0121
## 4 0.9832 nan 0.1000 0.0103
## 5 0.9640 nan 0.1000 0.0097
## 6 0.9471 nan 0.1000 0.0085
## 7 0.9339 nan 0.1000 0.0066
## 8 0.9184 nan 0.1000 0.0077
## 9 0.9087 nan 0.1000 0.0049
## 10 0.8959 nan 0.1000 0.0062
## 20 0.8137 nan 0.1000 0.0025
## 40 0.7360 nan 0.1000 0.0017
## 60 0.6956 nan 0.1000 0.0008
## 80 0.6705 nan 0.1000 0.0005
## 100 0.6545 nan 0.1000 0.0004
## 120 0.6433 nan 0.1000 0.0001
## 140 0.6350 nan 0.1000 0.0001
## 150 0.6323 nan 0.1000 0.0002
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0437 nan 0.1000 0.0256
## 2 1.0041 nan 0.1000 0.0198
## 3 0.9712 nan 0.1000 0.0169
## 4 0.9436 nan 0.1000 0.0136
## 5 0.9141 nan 0.1000 0.0146
## 6 0.8910 nan 0.1000 0.0117
## 7 0.8714 nan 0.1000 0.0099
## 8 0.8559 nan 0.1000 0.0077
## 9 0.8437 nan 0.1000 0.0060
## 10 0.8308 nan 0.1000 0.0063
## 20 0.7500 nan 0.1000 0.0031
## 40 0.6765 nan 0.1000 0.0013
## 60 0.6467 nan 0.1000 0.0004
## 80 0.6303 nan 0.1000 0.0004
## 100 0.6186 nan 0.1000 0.0003
## 120 0.6107 nan 0.1000 0.0002
## 140 0.6045 nan 0.1000 -0.0000
## 150 0.6016 nan 0.1000 0.0000
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0356 nan 0.1000 0.0296
## 2 0.9888 nan 0.1000 0.0231
## 3 0.9516 nan 0.1000 0.0188
## 4 0.9199 nan 0.1000 0.0155
## 5 0.8939 nan 0.1000 0.0129
## 6 0.8687 nan 0.1000 0.0121
## 7 0.8500 nan 0.1000 0.0092
## 8 0.8318 nan 0.1000 0.0090
## 9 0.8166 nan 0.1000 0.0075
## 10 0.8030 nan 0.1000 0.0068
## 20 0.7187 nan 0.1000 0.0035
## 40 0.6539 nan 0.1000 0.0005
## 60 0.6268 nan 0.1000 0.0007
## 80 0.6108 nan 0.1000 0.0002
## 100 0.6012 nan 0.1000 0.0001
## 120 0.5928 nan 0.1000 0.0002
## 140 0.5869 nan 0.1000 0.0001
## 150 0.5836 nan 0.1000 0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0582 nan 0.1000 0.0184
## 2 1.0287 nan 0.1000 0.0148
## 3 1.0042 nan 0.1000 0.0121
## 4 0.9836 nan 0.1000 0.0102
## 5 0.9642 nan 0.1000 0.0096
## 6 0.9471 nan 0.1000 0.0084
## 7 0.9340 nan 0.1000 0.0064
## 8 0.9185 nan 0.1000 0.0076
## 9 0.9054 nan 0.1000 0.0063
## 10 0.8954 nan 0.1000 0.0049
## 20 0.8156 nan 0.1000 0.0034
## 40 0.7378 nan 0.1000 0.0009
## 60 0.6960 nan 0.1000 0.0008
## 80 0.6710 nan 0.1000 0.0004
## 100 0.6565 nan 0.1000 0.0003
## 120 0.6461 nan 0.1000 0.0003
## 140 0.6388 nan 0.1000 0.0002
## 150 0.6358 nan 0.1000 0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0440 nan 0.1000 0.0261
## 2 1.0044 nan 0.1000 0.0196
## 3 0.9714 nan 0.1000 0.0165
## 4 0.9450 nan 0.1000 0.0131
## 5 0.9159 nan 0.1000 0.0144
## 6 0.8925 nan 0.1000 0.0117
## 7 0.8752 nan 0.1000 0.0085
## 8 0.8591 nan 0.1000 0.0080
## 9 0.8432 nan 0.1000 0.0078
## 10 0.8321 nan 0.1000 0.0056
## 20 0.7511 nan 0.1000 0.0032
## 40 0.6790 nan 0.1000 0.0011
## 60 0.6486 nan 0.1000 0.0005
## 80 0.6335 nan 0.1000 0.0002
## 100 0.6224 nan 0.1000 0.0000
## 120 0.6150 nan 0.1000 0.0001
## 140 0.6088 nan 0.1000 0.0001
## 150 0.6061 nan 0.1000 0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0365 nan 0.1000 0.0290
## 2 0.9908 nan 0.1000 0.0231
## 3 0.9537 nan 0.1000 0.0187
## 4 0.9215 nan 0.1000 0.0159
## 5 0.8951 nan 0.1000 0.0131
## 6 0.8697 nan 0.1000 0.0124
## 7 0.8516 nan 0.1000 0.0090
## 8 0.8338 nan 0.1000 0.0089
## 9 0.8174 nan 0.1000 0.0083
## 10 0.8036 nan 0.1000 0.0067
## 20 0.7193 nan 0.1000 0.0026
## 40 0.6550 nan 0.1000 0.0009
## 60 0.6292 nan 0.1000 0.0003
## 80 0.6123 nan 0.1000 0.0001
## 100 0.6024 nan 0.1000 0.0001
## 120 0.5951 nan 0.1000 0.0001
## 140 0.5884 nan 0.1000 0.0000
## 150 0.5864 nan 0.1000 0.0000
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0586 nan 0.1000 0.0186
## 2 1.0286 nan 0.1000 0.0150
## 3 1.0046 nan 0.1000 0.0122
## 4 0.9845 nan 0.1000 0.0102
## 5 0.9647 nan 0.1000 0.0098
## 6 0.9509 nan 0.1000 0.0068
## 7 0.9342 nan 0.1000 0.0083
## 8 0.9192 nan 0.1000 0.0078
## 9 0.9057 nan 0.1000 0.0064
## 10 0.8929 nan 0.1000 0.0063
## 20 0.8154 nan 0.1000 0.0024
## 40 0.7374 nan 0.1000 0.0015
## 60 0.6954 nan 0.1000 0.0005
## 80 0.6712 nan 0.1000 0.0003
## 100 0.6563 nan 0.1000 0.0006
## 120 0.6441 nan 0.1000 0.0002
## 140 0.6367 nan 0.1000 0.0001
## 150 0.6333 nan 0.1000 0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0432 nan 0.1000 0.0260
## 2 1.0034 nan 0.1000 0.0197
## 3 0.9702 nan 0.1000 0.0164
## 4 0.9428 nan 0.1000 0.0136
## 5 0.9144 nan 0.1000 0.0143
## 6 0.8909 nan 0.1000 0.0113
## 7 0.8718 nan 0.1000 0.0095
## 8 0.8553 nan 0.1000 0.0081
## 9 0.8420 nan 0.1000 0.0067
## 10 0.8323 nan 0.1000 0.0047
## 20 0.7509 nan 0.1000 0.0026
## 40 0.6769 nan 0.1000 0.0010
## 60 0.6481 nan 0.1000 0.0003
## 80 0.6322 nan 0.1000 0.0002
## 100 0.6208 nan 0.1000 0.0003
## 120 0.6123 nan 0.1000 0.0002
## 140 0.6067 nan 0.1000 0.0001
## 150 0.6036 nan 0.1000 0.0002
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0360 nan 0.1000 0.0296
## 2 0.9902 nan 0.1000 0.0234
## 3 0.9532 nan 0.1000 0.0181
## 4 0.9213 nan 0.1000 0.0162
## 5 0.8956 nan 0.1000 0.0128
## 6 0.8702 nan 0.1000 0.0127
## 7 0.8511 nan 0.1000 0.0094
## 8 0.8328 nan 0.1000 0.0090
## 9 0.8169 nan 0.1000 0.0076
## 10 0.8026 nan 0.1000 0.0070
## 20 0.7207 nan 0.1000 0.0027
## 40 0.6540 nan 0.1000 0.0012
## 60 0.6270 nan 0.1000 0.0003
## 80 0.6116 nan 0.1000 0.0001
## 100 0.6013 nan 0.1000 0.0000
## 120 0.5926 nan 0.1000 0.0000
## 140 0.5867 nan 0.1000 0.0000
## 150 0.5840 nan 0.1000 0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0579 nan 0.1000 0.0188
## 2 1.0280 nan 0.1000 0.0152
## 3 1.0031 nan 0.1000 0.0124
## 4 0.9822 nan 0.1000 0.0103
## 5 0.9625 nan 0.1000 0.0099
## 6 0.9445 nan 0.1000 0.0085
## 7 0.9311 nan 0.1000 0.0067
## 8 0.9157 nan 0.1000 0.0079
## 9 0.9056 nan 0.1000 0.0049
## 10 0.8932 nan 0.1000 0.0063
## 20 0.8097 nan 0.1000 0.0031
## 40 0.7327 nan 0.1000 0.0011
## 60 0.6908 nan 0.1000 0.0009
## 80 0.6661 nan 0.1000 0.0006
## 100 0.6509 nan 0.1000 0.0002
## 120 0.6397 nan 0.1000 0.0003
## 140 0.6323 nan 0.1000 0.0001
## 150 0.6291 nan 0.1000 0.0000
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0430 nan 0.1000 0.0267
## 2 1.0026 nan 0.1000 0.0202
## 3 0.9692 nan 0.1000 0.0167
## 4 0.9406 nan 0.1000 0.0138
## 5 0.9118 nan 0.1000 0.0146
## 6 0.8881 nan 0.1000 0.0117
## 7 0.8705 nan 0.1000 0.0088
## 8 0.8555 nan 0.1000 0.0074
## 9 0.8418 nan 0.1000 0.0066
## 10 0.8294 nan 0.1000 0.0061
## 20 0.7440 nan 0.1000 0.0027
## 40 0.6737 nan 0.1000 0.0009
## 60 0.6434 nan 0.1000 0.0004
## 80 0.6276 nan 0.1000 0.0001
## 100 0.6170 nan 0.1000 0.0001
## 120 0.6086 nan 0.1000 0.0001
## 140 0.6029 nan 0.1000 0.0001
## 150 0.6007 nan 0.1000 0.0000
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0352 nan 0.1000 0.0296
## 2 0.9871 nan 0.1000 0.0239
## 3 0.9494 nan 0.1000 0.0193
## 4 0.9180 nan 0.1000 0.0157
## 5 0.8925 nan 0.1000 0.0126
## 6 0.8691 nan 0.1000 0.0115
## 7 0.8500 nan 0.1000 0.0093
## 8 0.8308 nan 0.1000 0.0095
## 9 0.8147 nan 0.1000 0.0080
## 10 0.7993 nan 0.1000 0.0076
## 20 0.7161 nan 0.1000 0.0031
## 40 0.6499 nan 0.1000 0.0008
## 60 0.6215 nan 0.1000 0.0005
## 80 0.6070 nan 0.1000 0.0001
## 100 0.5963 nan 0.1000 0.0001
## 120 0.5899 nan 0.1000 0.0000
## 140 0.5845 nan 0.1000 0.0003
## 150 0.5810 nan 0.1000 -0.0000
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0585 nan 0.1000 0.0186
## 2 1.0283 nan 0.1000 0.0150
## 3 1.0034 nan 0.1000 0.0121
## 4 0.9829 nan 0.1000 0.0102
## 5 0.9638 nan 0.1000 0.0097
## 6 0.9499 nan 0.1000 0.0071
## 7 0.9329 nan 0.1000 0.0083
## 8 0.9174 nan 0.1000 0.0077
## 9 0.9044 nan 0.1000 0.0064
## 10 0.8950 nan 0.1000 0.0048
## 20 0.8149 nan 0.1000 0.0023
## 40 0.7363 nan 0.1000 0.0012
## 60 0.6957 nan 0.1000 0.0008
## 80 0.6709 nan 0.1000 0.0003
## 100 0.6556 nan 0.1000 0.0005
## 120 0.6460 nan 0.1000 0.0001
## 140 0.6373 nan 0.1000 0.0001
## 150 0.6344 nan 0.1000 0.0000
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0437 nan 0.1000 0.0259
## 2 1.0041 nan 0.1000 0.0196
## 3 0.9713 nan 0.1000 0.0163
## 4 0.9453 nan 0.1000 0.0131
## 5 0.9162 nan 0.1000 0.0147
## 6 0.8925 nan 0.1000 0.0119
## 7 0.8731 nan 0.1000 0.0098
## 8 0.8588 nan 0.1000 0.0073
## 9 0.8471 nan 0.1000 0.0058
## 10 0.8339 nan 0.1000 0.0066
## 20 0.7482 nan 0.1000 0.0033
## 40 0.6791 nan 0.1000 0.0007
## 60 0.6498 nan 0.1000 0.0006
## 80 0.6340 nan 0.1000 0.0004
## 100 0.6219 nan 0.1000 0.0001
## 120 0.6137 nan 0.1000 0.0000
## 140 0.6068 nan 0.1000 0.0002
## 150 0.6043 nan 0.1000 0.0000
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0371 nan 0.1000 0.0291
## 2 0.9892 nan 0.1000 0.0238
## 3 0.9517 nan 0.1000 0.0186
## 4 0.9208 nan 0.1000 0.0154
## 5 0.8912 nan 0.1000 0.0147
## 6 0.8697 nan 0.1000 0.0107
## 7 0.8489 nan 0.1000 0.0102
## 8 0.8306 nan 0.1000 0.0089
## 9 0.8170 nan 0.1000 0.0066
## 10 0.8056 nan 0.1000 0.0055
## 20 0.7221 nan 0.1000 0.0018
## 40 0.6537 nan 0.1000 0.0006
## 60 0.6255 nan 0.1000 0.0004
## 80 0.6093 nan 0.1000 0.0003
## 100 0.5993 nan 0.1000 0.0000
## 120 0.5928 nan 0.1000 0.0001
## 140 0.5870 nan 0.1000 -0.0001
## 150 0.5845 nan 0.1000 -0.0000
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0580 nan 0.1000 0.0186
## 2 1.0284 nan 0.1000 0.0150
## 3 1.0034 nan 0.1000 0.0122
## 4 0.9831 nan 0.1000 0.0101
## 5 0.9685 nan 0.1000 0.0072
## 6 0.9494 nan 0.1000 0.0096
## 7 0.9327 nan 0.1000 0.0083
## 8 0.9172 nan 0.1000 0.0078
## 9 0.9041 nan 0.1000 0.0063
## 10 0.8911 nan 0.1000 0.0063
## 20 0.8149 nan 0.1000 0.0022
## 40 0.7350 nan 0.1000 0.0013
## 60 0.6951 nan 0.1000 0.0007
## 80 0.6696 nan 0.1000 0.0002
## 100 0.6547 nan 0.1000 0.0002
## 120 0.6433 nan 0.1000 0.0002
## 140 0.6357 nan 0.1000 0.0001
## 150 0.6325 nan 0.1000 0.0000
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0432 nan 0.1000 0.0261
## 2 1.0034 nan 0.1000 0.0199
## 3 0.9708 nan 0.1000 0.0161
## 4 0.9355 nan 0.1000 0.0172
## 5 0.9081 nan 0.1000 0.0137
## 6 0.8892 nan 0.1000 0.0096
## 7 0.8691 nan 0.1000 0.0101
## 8 0.8535 nan 0.1000 0.0078
## 9 0.8403 nan 0.1000 0.0063
## 10 0.8286 nan 0.1000 0.0059
## 20 0.7466 nan 0.1000 0.0029
## 40 0.6757 nan 0.1000 0.0013
## 60 0.6467 nan 0.1000 0.0003
## 80 0.6310 nan 0.1000 0.0004
## 100 0.6190 nan 0.1000 0.0003
## 120 0.6105 nan 0.1000 0.0001
## 140 0.6041 nan 0.1000 0.0000
## 150 0.6020 nan 0.1000 0.0000
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0352 nan 0.1000 0.0297
## 2 0.9910 nan 0.1000 0.0221
## 3 0.9520 nan 0.1000 0.0192
## 4 0.9209 nan 0.1000 0.0156
## 5 0.8949 nan 0.1000 0.0131
## 6 0.8734 nan 0.1000 0.0106
## 7 0.8518 nan 0.1000 0.0106
## 8 0.8331 nan 0.1000 0.0094
## 9 0.8178 nan 0.1000 0.0076
## 10 0.8051 nan 0.1000 0.0063
## 20 0.7192 nan 0.1000 0.0030
## 40 0.6520 nan 0.1000 0.0007
## 60 0.6260 nan 0.1000 0.0003
## 80 0.6105 nan 0.1000 0.0001
## 100 0.5997 nan 0.1000 0.0001
## 120 0.5928 nan 0.1000 0.0001
## 140 0.5877 nan 0.1000 0.0000
## 150 0.5846 nan 0.1000 0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0581 nan 0.1000 0.0187
## 2 1.0283 nan 0.1000 0.0151
## 3 1.0030 nan 0.1000 0.0122
## 4 0.9827 nan 0.1000 0.0099
## 5 0.9635 nan 0.1000 0.0098
## 6 0.9496 nan 0.1000 0.0067
## 7 0.9333 nan 0.1000 0.0082
## 8 0.9179 nan 0.1000 0.0078
## 9 0.9046 nan 0.1000 0.0063
## 10 0.8916 nan 0.1000 0.0064
## 20 0.8132 nan 0.1000 0.0033
## 40 0.7348 nan 0.1000 0.0018
## 60 0.6939 nan 0.1000 0.0008
## 80 0.6701 nan 0.1000 0.0005
## 100 0.6539 nan 0.1000 0.0003
## 120 0.6446 nan 0.1000 0.0003
## 140 0.6360 nan 0.1000 0.0001
## 150 0.6332 nan 0.1000 0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0429 nan 0.1000 0.0264
## 2 1.0034 nan 0.1000 0.0200
## 3 0.9702 nan 0.1000 0.0170
## 4 0.9434 nan 0.1000 0.0133
## 5 0.9140 nan 0.1000 0.0146
## 6 0.8907 nan 0.1000 0.0118
## 7 0.8713 nan 0.1000 0.0096
## 8 0.8562 nan 0.1000 0.0075
## 9 0.8419 nan 0.1000 0.0070
## 10 0.8307 nan 0.1000 0.0056
## 20 0.7492 nan 0.1000 0.0024
## 40 0.6763 nan 0.1000 0.0009
## 60 0.6458 nan 0.1000 0.0007
## 80 0.6305 nan 0.1000 0.0002
## 100 0.6191 nan 0.1000 0.0001
## 120 0.6114 nan 0.1000 0.0001
## 140 0.6048 nan 0.1000 0.0000
## 150 0.6018 nan 0.1000 0.0000
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0366 nan 0.1000 0.0293
## 2 0.9886 nan 0.1000 0.0237
## 3 0.9516 nan 0.1000 0.0187
## 4 0.9193 nan 0.1000 0.0159
## 5 0.8936 nan 0.1000 0.0130
## 6 0.8684 nan 0.1000 0.0126
## 7 0.8497 nan 0.1000 0.0095
## 8 0.8314 nan 0.1000 0.0091
## 9 0.8182 nan 0.1000 0.0064
## 10 0.8054 nan 0.1000 0.0061
## 20 0.7195 nan 0.1000 0.0027
## 40 0.6526 nan 0.1000 0.0009
## 60 0.6265 nan 0.1000 0.0006
## 80 0.6108 nan 0.1000 0.0003
## 100 0.5995 nan 0.1000 0.0003
## 120 0.5920 nan 0.1000 0.0001
## 140 0.5856 nan 0.1000 0.0000
## 150 0.5833 nan 0.1000 0.0000
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0588 nan 0.1000 0.0187
## 2 1.0282 nan 0.1000 0.0151
## 3 1.0037 nan 0.1000 0.0123
## 4 0.9837 nan 0.1000 0.0101
## 5 0.9639 nan 0.1000 0.0098
## 6 0.9499 nan 0.1000 0.0069
## 7 0.9339 nan 0.1000 0.0079
## 8 0.9186 nan 0.1000 0.0078
## 9 0.9087 nan 0.1000 0.0048
## 10 0.8955 nan 0.1000 0.0064
## 20 0.8145 nan 0.1000 0.0033
## 40 0.7372 nan 0.1000 0.0015
## 60 0.6971 nan 0.1000 0.0008
## 80 0.6717 nan 0.1000 0.0006
## 100 0.6566 nan 0.1000 0.0002
## 120 0.6451 nan 0.1000 0.0001
## 140 0.6369 nan 0.1000 0.0001
## 150 0.6341 nan 0.1000 0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0438 nan 0.1000 0.0256
## 2 1.0041 nan 0.1000 0.0194
## 3 0.9705 nan 0.1000 0.0167
## 4 0.9440 nan 0.1000 0.0132
## 5 0.9153 nan 0.1000 0.0145
## 6 0.8912 nan 0.1000 0.0116
## 7 0.8717 nan 0.1000 0.0096
## 8 0.8576 nan 0.1000 0.0071
## 9 0.8437 nan 0.1000 0.0069
## 10 0.8317 nan 0.1000 0.0060
## 20 0.7500 nan 0.1000 0.0025
## 40 0.6773 nan 0.1000 0.0010
## 60 0.6483 nan 0.1000 0.0004
## 80 0.6320 nan 0.1000 0.0002
## 100 0.6223 nan 0.1000 0.0001
## 120 0.6129 nan 0.1000 0.0001
## 140 0.6073 nan 0.1000 0.0001
## 150 0.6038 nan 0.1000 0.0003
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0366 nan 0.1000 0.0298
## 2 0.9914 nan 0.1000 0.0227
## 3 0.9517 nan 0.1000 0.0200
## 4 0.9208 nan 0.1000 0.0152
## 5 0.8956 nan 0.1000 0.0126
## 6 0.8733 nan 0.1000 0.0112
## 7 0.8518 nan 0.1000 0.0109
## 8 0.8338 nan 0.1000 0.0091
## 9 0.8182 nan 0.1000 0.0077
## 10 0.8044 nan 0.1000 0.0066
## 20 0.7205 nan 0.1000 0.0030
## 40 0.6553 nan 0.1000 0.0009
## 60 0.6267 nan 0.1000 0.0005
## 80 0.6114 nan 0.1000 0.0006
## 100 0.6006 nan 0.1000 0.0003
## 120 0.5932 nan 0.1000 0.0000
## 140 0.5877 nan 0.1000 0.0000
## 150 0.5848 nan 0.1000 0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0589 nan 0.1000 0.0184
## 2 1.0289 nan 0.1000 0.0149
## 3 1.0043 nan 0.1000 0.0121
## 4 0.9840 nan 0.1000 0.0108
## 5 0.9643 nan 0.1000 0.0096
## 6 0.9503 nan 0.1000 0.0071
## 7 0.9335 nan 0.1000 0.0084
## 8 0.9184 nan 0.1000 0.0076
## 9 0.9085 nan 0.1000 0.0049
## 10 0.8961 nan 0.1000 0.0064
## 20 0.8131 nan 0.1000 0.0032
## 40 0.7352 nan 0.1000 0.0011
## 60 0.6947 nan 0.1000 0.0005
## 80 0.6709 nan 0.1000 0.0007
## 100 0.6549 nan 0.1000 0.0004
## 120 0.6440 nan 0.1000 0.0001
## 140 0.6362 nan 0.1000 0.0003
## 150 0.6334 nan 0.1000 0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0438 nan 0.1000 0.0259
## 2 1.0047 nan 0.1000 0.0196
## 3 0.9717 nan 0.1000 0.0167
## 4 0.9431 nan 0.1000 0.0136
## 5 0.9143 nan 0.1000 0.0142
## 6 0.8912 nan 0.1000 0.0117
## 7 0.8747 nan 0.1000 0.0081
## 8 0.8596 nan 0.1000 0.0073
## 9 0.8435 nan 0.1000 0.0079
## 10 0.8299 nan 0.1000 0.0069
## 20 0.7514 nan 0.1000 0.0020
## 40 0.6773 nan 0.1000 0.0008
## 60 0.6473 nan 0.1000 0.0004
## 80 0.6311 nan 0.1000 0.0004
## 100 0.6200 nan 0.1000 0.0001
## 120 0.6123 nan 0.1000 0.0001
## 140 0.6058 nan 0.1000 0.0002
## 150 0.6036 nan 0.1000 0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0367 nan 0.1000 0.0294
## 2 0.9898 nan 0.1000 0.0235
## 3 0.9548 nan 0.1000 0.0176
## 4 0.9233 nan 0.1000 0.0157
## 5 0.8969 nan 0.1000 0.0130
## 6 0.8716 nan 0.1000 0.0128
## 7 0.8497 nan 0.1000 0.0108
## 8 0.8321 nan 0.1000 0.0091
## 9 0.8183 nan 0.1000 0.0066
## 10 0.8035 nan 0.1000 0.0072
## 20 0.7202 nan 0.1000 0.0025
## 40 0.6526 nan 0.1000 0.0010
## 60 0.6256 nan 0.1000 0.0004
## 80 0.6098 nan 0.1000 0.0002
## 100 0.6003 nan 0.1000 -0.0000
## 120 0.5918 nan 0.1000 0.0001
## 140 0.5863 nan 0.1000 0.0001
## 150 0.5838 nan 0.1000 -0.0000
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0577 nan 0.1000 0.0185
## 2 1.0273 nan 0.1000 0.0148
## 3 1.0032 nan 0.1000 0.0120
## 4 0.9825 nan 0.1000 0.0102
## 5 0.9642 nan 0.1000 0.0089
## 6 0.9451 nan 0.1000 0.0095
## 7 0.9317 nan 0.1000 0.0065
## 8 0.9165 nan 0.1000 0.0077
## 9 0.9033 nan 0.1000 0.0065
## 10 0.8935 nan 0.1000 0.0047
## 20 0.8113 nan 0.1000 0.0033
## 40 0.7342 nan 0.1000 0.0015
## 60 0.6921 nan 0.1000 0.0005
## 80 0.6674 nan 0.1000 0.0007
## 100 0.6519 nan 0.1000 0.0004
## 120 0.6409 nan 0.1000 0.0001
## 140 0.6341 nan 0.1000 0.0000
## 150 0.6305 nan 0.1000 0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0431 nan 0.1000 0.0255
## 2 1.0036 nan 0.1000 0.0197
## 3 0.9705 nan 0.1000 0.0164
## 4 0.9430 nan 0.1000 0.0135
## 5 0.9134 nan 0.1000 0.0147
## 6 0.8945 nan 0.1000 0.0093
## 7 0.8736 nan 0.1000 0.0103
## 8 0.8568 nan 0.1000 0.0083
## 9 0.8431 nan 0.1000 0.0068
## 10 0.8296 nan 0.1000 0.0068
## 20 0.7481 nan 0.1000 0.0031
## 40 0.6746 nan 0.1000 0.0011
## 60 0.6438 nan 0.1000 0.0005
## 80 0.6291 nan 0.1000 0.0002
## 100 0.6172 nan 0.1000 0.0001
## 120 0.6090 nan 0.1000 0.0002
## 140 0.6017 nan 0.1000 0.0001
## 150 0.5993 nan 0.1000 0.0000
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0361 nan 0.1000 0.0298
## 2 0.9893 nan 0.1000 0.0233
## 3 0.9519 nan 0.1000 0.0185
## 4 0.9192 nan 0.1000 0.0163
## 5 0.8928 nan 0.1000 0.0131
## 6 0.8705 nan 0.1000 0.0110
## 7 0.8487 nan 0.1000 0.0109
## 8 0.8336 nan 0.1000 0.0074
## 9 0.8190 nan 0.1000 0.0070
## 10 0.8032 nan 0.1000 0.0077
## 20 0.7153 nan 0.1000 0.0029
## 40 0.6495 nan 0.1000 0.0009
## 60 0.6226 nan 0.1000 0.0004
## 80 0.6080 nan 0.1000 0.0002
## 100 0.5976 nan 0.1000 0.0002
## 120 0.5888 nan 0.1000 0.0001
## 140 0.5837 nan 0.1000 0.0001
## 150 0.5811 nan 0.1000 -0.0000
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0580 nan 0.1000 0.0187
## 2 1.0276 nan 0.1000 0.0150
## 3 1.0034 nan 0.1000 0.0122
## 4 0.9832 nan 0.1000 0.0097
## 5 0.9634 nan 0.1000 0.0098
## 6 0.9463 nan 0.1000 0.0083
## 7 0.9331 nan 0.1000 0.0065
## 8 0.9175 nan 0.1000 0.0078
## 9 0.9041 nan 0.1000 0.0064
## 10 0.8918 nan 0.1000 0.0062
## 20 0.8138 nan 0.1000 0.0034
## 40 0.7352 nan 0.1000 0.0012
## 60 0.6944 nan 0.1000 0.0011
## 80 0.6704 nan 0.1000 0.0007
## 100 0.6559 nan 0.1000 0.0004
## 120 0.6445 nan 0.1000 0.0002
## 140 0.6364 nan 0.1000 0.0002
## 150 0.6337 nan 0.1000 0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0437 nan 0.1000 0.0258
## 2 1.0026 nan 0.1000 0.0198
## 3 0.9692 nan 0.1000 0.0163
## 4 0.9418 nan 0.1000 0.0137
## 5 0.9128 nan 0.1000 0.0141
## 6 0.8896 nan 0.1000 0.0116
## 7 0.8699 nan 0.1000 0.0096
## 8 0.8556 nan 0.1000 0.0072
## 9 0.8404 nan 0.1000 0.0076
## 10 0.8310 nan 0.1000 0.0046
## 20 0.7502 nan 0.1000 0.0032
## 40 0.6768 nan 0.1000 0.0009
## 60 0.6461 nan 0.1000 0.0004
## 80 0.6303 nan 0.1000 0.0002
## 100 0.6198 nan 0.1000 0.0003
## 120 0.6112 nan 0.1000 0.0002
## 140 0.6053 nan 0.1000 0.0000
## 150 0.6025 nan 0.1000 0.0000
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0356 nan 0.1000 0.0293
## 2 0.9890 nan 0.1000 0.0231
## 3 0.9507 nan 0.1000 0.0188
## 4 0.9195 nan 0.1000 0.0157
## 5 0.8943 nan 0.1000 0.0126
## 6 0.8718 nan 0.1000 0.0114
## 7 0.8525 nan 0.1000 0.0095
## 8 0.8340 nan 0.1000 0.0095
## 9 0.8184 nan 0.1000 0.0078
## 10 0.8068 nan 0.1000 0.0058
## 20 0.7227 nan 0.1000 0.0034
## 40 0.6531 nan 0.1000 0.0010
## 60 0.6266 nan 0.1000 0.0005
## 80 0.6107 nan 0.1000 0.0004
## 100 0.6016 nan 0.1000 0.0000
## 120 0.5928 nan 0.1000 0.0001
## 140 0.5878 nan 0.1000 0.0002
## 150 0.5858 nan 0.1000 0.0000
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0581 nan 0.1000 0.0186
## 2 1.0290 nan 0.1000 0.0150
## 3 1.0044 nan 0.1000 0.0122
## 4 0.9839 nan 0.1000 0.0103
## 5 0.9644 nan 0.1000 0.0098
## 6 0.9501 nan 0.1000 0.0069
## 7 0.9333 nan 0.1000 0.0082
## 8 0.9180 nan 0.1000 0.0078
## 9 0.9078 nan 0.1000 0.0050
## 10 0.8954 nan 0.1000 0.0062
## 20 0.8121 nan 0.1000 0.0033
## 40 0.7347 nan 0.1000 0.0015
## 60 0.6944 nan 0.1000 0.0007
## 80 0.6690 nan 0.1000 0.0006
## 100 0.6535 nan 0.1000 0.0002
## 120 0.6427 nan 0.1000 0.0002
## 140 0.6344 nan 0.1000 0.0000
## 150 0.6313 nan 0.1000 0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0447 nan 0.1000 0.0259
## 2 1.0046 nan 0.1000 0.0202
## 3 0.9711 nan 0.1000 0.0170
## 4 0.9442 nan 0.1000 0.0135
## 5 0.9151 nan 0.1000 0.0144
## 6 0.8918 nan 0.1000 0.0117
## 7 0.8743 nan 0.1000 0.0087
## 8 0.8583 nan 0.1000 0.0081
## 9 0.8421 nan 0.1000 0.0081
## 10 0.8298 nan 0.1000 0.0062
## 20 0.7475 nan 0.1000 0.0031
## 40 0.6756 nan 0.1000 0.0008
## 60 0.6462 nan 0.1000 0.0003
## 80 0.6286 nan 0.1000 0.0003
## 100 0.6184 nan 0.1000 0.0001
## 120 0.6094 nan 0.1000 0.0000
## 140 0.6031 nan 0.1000 0.0000
## 150 0.6003 nan 0.1000 0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0355 nan 0.1000 0.0295
## 2 0.9890 nan 0.1000 0.0234
## 3 0.9514 nan 0.1000 0.0185
## 4 0.9186 nan 0.1000 0.0159
## 5 0.8930 nan 0.1000 0.0127
## 6 0.8702 nan 0.1000 0.0111
## 7 0.8479 nan 0.1000 0.0113
## 8 0.8301 nan 0.1000 0.0090
## 9 0.8170 nan 0.1000 0.0063
## 10 0.8044 nan 0.1000 0.0062
## 20 0.7174 nan 0.1000 0.0029
## 40 0.6523 nan 0.1000 0.0006
## 60 0.6258 nan 0.1000 0.0002
## 80 0.6097 nan 0.1000 0.0002
## 100 0.5988 nan 0.1000 -0.0000
## 120 0.5908 nan 0.1000 0.0002
## 140 0.5857 nan 0.1000 0.0000
## 150 0.5831 nan 0.1000 0.0000
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0578 nan 0.1000 0.0189
## 2 1.0275 nan 0.1000 0.0152
## 3 1.0026 nan 0.1000 0.0124
## 4 0.9818 nan 0.1000 0.0101
## 5 0.9619 nan 0.1000 0.0099
## 6 0.9450 nan 0.1000 0.0084
## 7 0.9286 nan 0.1000 0.0080
## 8 0.9160 nan 0.1000 0.0066
## 9 0.9035 nan 0.1000 0.0063
## 10 0.8938 nan 0.1000 0.0048
## 20 0.8112 nan 0.1000 0.0033
## 40 0.7333 nan 0.1000 0.0012
## 60 0.6926 nan 0.1000 0.0005
## 80 0.6669 nan 0.1000 0.0007
## 100 0.6509 nan 0.1000 0.0002
## 120 0.6411 nan 0.1000 0.0001
## 140 0.6329 nan 0.1000 0.0003
## 150 0.6291 nan 0.1000 0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0435 nan 0.1000 0.0261
## 2 1.0030 nan 0.1000 0.0200
## 3 0.9711 nan 0.1000 0.0158
## 4 0.9424 nan 0.1000 0.0144
## 5 0.9126 nan 0.1000 0.0147
## 6 0.8883 nan 0.1000 0.0118
## 7 0.8684 nan 0.1000 0.0097
## 8 0.8541 nan 0.1000 0.0072
## 9 0.8400 nan 0.1000 0.0069
## 10 0.8283 nan 0.1000 0.0057
## 20 0.7489 nan 0.1000 0.0027
## 40 0.6751 nan 0.1000 0.0012
## 60 0.6447 nan 0.1000 0.0004
## 80 0.6287 nan 0.1000 0.0002
## 100 0.6185 nan 0.1000 0.0001
## 120 0.6103 nan 0.1000 0.0001
## 140 0.6036 nan 0.1000 0.0001
## 150 0.6010 nan 0.1000 0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0357 nan 0.1000 0.0296
## 2 0.9892 nan 0.1000 0.0237
## 3 0.9507 nan 0.1000 0.0189
## 4 0.9196 nan 0.1000 0.0156
## 5 0.8927 nan 0.1000 0.0135
## 6 0.8704 nan 0.1000 0.0110
## 7 0.8481 nan 0.1000 0.0110
## 8 0.8297 nan 0.1000 0.0090
## 9 0.8168 nan 0.1000 0.0064
## 10 0.8025 nan 0.1000 0.0068
## 20 0.7194 nan 0.1000 0.0031
## 40 0.6509 nan 0.1000 0.0012
## 60 0.6246 nan 0.1000 0.0001
## 80 0.6093 nan 0.1000 0.0002
## 100 0.5991 nan 0.1000 0.0001
## 120 0.5911 nan 0.1000 0.0000
## 140 0.5859 nan 0.1000 0.0000
## 150 0.5834 nan 0.1000 0.0000
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0589 nan 0.1000 0.0186
## 2 1.0292 nan 0.1000 0.0150
## 3 1.0050 nan 0.1000 0.0122
## 4 0.9847 nan 0.1000 0.0100
## 5 0.9655 nan 0.1000 0.0098
## 6 0.9481 nan 0.1000 0.0084
## 7 0.9350 nan 0.1000 0.0067
## 8 0.9201 nan 0.1000 0.0078
## 9 0.9099 nan 0.1000 0.0050
## 10 0.8975 nan 0.1000 0.0061
## 20 0.8166 nan 0.1000 0.0023
## 40 0.7378 nan 0.1000 0.0011
## 60 0.6980 nan 0.1000 0.0013
## 80 0.6730 nan 0.1000 0.0005
## 100 0.6567 nan 0.1000 0.0002
## 120 0.6464 nan 0.1000 0.0002
## 140 0.6383 nan 0.1000 0.0001
## 150 0.6353 nan 0.1000 0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0436 nan 0.1000 0.0256
## 2 1.0040 nan 0.1000 0.0196
## 3 0.9709 nan 0.1000 0.0166
## 4 0.9430 nan 0.1000 0.0136
## 5 0.9144 nan 0.1000 0.0144
## 6 0.8915 nan 0.1000 0.0115
## 7 0.8722 nan 0.1000 0.0096
## 8 0.8576 nan 0.1000 0.0073
## 9 0.8457 nan 0.1000 0.0058
## 10 0.8338 nan 0.1000 0.0061
## 20 0.7518 nan 0.1000 0.0033
## 40 0.6794 nan 0.1000 0.0011
## 60 0.6498 nan 0.1000 0.0005
## 80 0.6326 nan 0.1000 0.0003
## 100 0.6213 nan 0.1000 0.0001
## 120 0.6132 nan 0.1000 0.0001
## 140 0.6062 nan 0.1000 0.0001
## 150 0.6033 nan 0.1000 0.0002
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0360 nan 0.1000 0.0292
## 2 0.9893 nan 0.1000 0.0232
## 3 0.9515 nan 0.1000 0.0186
## 4 0.9204 nan 0.1000 0.0154
## 5 0.8916 nan 0.1000 0.0140
## 6 0.8697 nan 0.1000 0.0108
## 7 0.8485 nan 0.1000 0.0107
## 8 0.8311 nan 0.1000 0.0087
## 9 0.8185 nan 0.1000 0.0061
## 10 0.8057 nan 0.1000 0.0063
## 20 0.7200 nan 0.1000 0.0028
## 40 0.6555 nan 0.1000 0.0010
## 60 0.6296 nan 0.1000 0.0006
## 80 0.6138 nan 0.1000 0.0002
## 100 0.6036 nan 0.1000 0.0003
## 120 0.5960 nan 0.1000 0.0000
## 140 0.5897 nan 0.1000 0.0000
## 150 0.5877 nan 0.1000 0.0002
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0582 nan 0.1000 0.0186
## 2 1.0283 nan 0.1000 0.0150
## 3 1.0041 nan 0.1000 0.0122
## 4 0.9837 nan 0.1000 0.0105
## 5 0.9645 nan 0.1000 0.0098
## 6 0.9468 nan 0.1000 0.0086
## 7 0.9332 nan 0.1000 0.0064
## 8 0.9171 nan 0.1000 0.0078
## 9 0.9071 nan 0.1000 0.0049
## 10 0.8946 nan 0.1000 0.0062
## 20 0.8153 nan 0.1000 0.0026
## 40 0.7344 nan 0.1000 0.0015
## 60 0.6945 nan 0.1000 0.0005
## 80 0.6695 nan 0.1000 0.0005
## 100 0.6545 nan 0.1000 0.0003
## 120 0.6432 nan 0.1000 0.0002
## 140 0.6353 nan 0.1000 0.0001
## 150 0.6325 nan 0.1000 0.0002
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0440 nan 0.1000 0.0261
## 2 1.0041 nan 0.1000 0.0200
## 3 0.9711 nan 0.1000 0.0165
## 4 0.9436 nan 0.1000 0.0137
## 5 0.9144 nan 0.1000 0.0147
## 6 0.8912 nan 0.1000 0.0120
## 7 0.8718 nan 0.1000 0.0098
## 8 0.8568 nan 0.1000 0.0074
## 9 0.8448 nan 0.1000 0.0059
## 10 0.8328 nan 0.1000 0.0059
## 20 0.7495 nan 0.1000 0.0029
## 40 0.6758 nan 0.1000 0.0007
## 60 0.6462 nan 0.1000 0.0006
## 80 0.6302 nan 0.1000 0.0004
## 100 0.6196 nan 0.1000 0.0001
## 120 0.6108 nan 0.1000 0.0001
## 140 0.6049 nan 0.1000 0.0000
## 150 0.6024 nan 0.1000 -0.0000
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0358 nan 0.1000 0.0294
## 2 0.9890 nan 0.1000 0.0233
## 3 0.9520 nan 0.1000 0.0184
## 4 0.9196 nan 0.1000 0.0165
## 5 0.8936 nan 0.1000 0.0127
## 6 0.8682 nan 0.1000 0.0127
## 7 0.8475 nan 0.1000 0.0100
## 8 0.8312 nan 0.1000 0.0081
## 9 0.8156 nan 0.1000 0.0078
## 10 0.8012 nan 0.1000 0.0072
## 20 0.7166 nan 0.1000 0.0031
## 40 0.6518 nan 0.1000 0.0010
## 60 0.6261 nan 0.1000 0.0004
## 80 0.6106 nan 0.1000 0.0003
## 100 0.6004 nan 0.1000 -0.0000
## 120 0.5923 nan 0.1000 0.0001
## 140 0.5868 nan 0.1000 0.0001
## 150 0.5843 nan 0.1000 -0.0000
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0589 nan 0.1000 0.0185
## 2 1.0286 nan 0.1000 0.0149
## 3 1.0049 nan 0.1000 0.0121
## 4 0.9847 nan 0.1000 0.0101
## 5 0.9653 nan 0.1000 0.0097
## 6 0.9479 nan 0.1000 0.0086
## 7 0.9346 nan 0.1000 0.0067
## 8 0.9193 nan 0.1000 0.0078
## 9 0.9093 nan 0.1000 0.0049
## 10 0.8966 nan 0.1000 0.0062
## 20 0.8168 nan 0.1000 0.0035
## 40 0.7368 nan 0.1000 0.0011
## 60 0.6972 nan 0.1000 0.0005
## 80 0.6714 nan 0.1000 0.0004
## 100 0.6563 nan 0.1000 0.0005
## 120 0.6463 nan 0.1000 0.0001
## 140 0.6381 nan 0.1000 0.0001
## 150 0.6349 nan 0.1000 0.0002
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0435 nan 0.1000 0.0261
## 2 1.0040 nan 0.1000 0.0196
## 3 0.9723 nan 0.1000 0.0155
## 4 0.9441 nan 0.1000 0.0141
## 5 0.9151 nan 0.1000 0.0146
## 6 0.8919 nan 0.1000 0.0116
## 7 0.8727 nan 0.1000 0.0096
## 8 0.8574 nan 0.1000 0.0077
## 9 0.8428 nan 0.1000 0.0073
## 10 0.8320 nan 0.1000 0.0053
## 20 0.7515 nan 0.1000 0.0028
## 40 0.6798 nan 0.1000 0.0006
## 60 0.6494 nan 0.1000 0.0003
## 80 0.6329 nan 0.1000 0.0001
## 100 0.6214 nan 0.1000 0.0001
## 120 0.6133 nan 0.1000 0.0003
## 140 0.6070 nan 0.1000 0.0001
## 150 0.6044 nan 0.1000 0.0000
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0368 nan 0.1000 0.0299
## 2 0.9906 nan 0.1000 0.0228
## 3 0.9529 nan 0.1000 0.0187
## 4 0.9219 nan 0.1000 0.0157
## 5 0.8963 nan 0.1000 0.0130
## 6 0.8736 nan 0.1000 0.0111
## 7 0.8525 nan 0.1000 0.0105
## 8 0.8362 nan 0.1000 0.0079
## 9 0.8200 nan 0.1000 0.0080
## 10 0.8084 nan 0.1000 0.0059
## 20 0.7230 nan 0.1000 0.0021
## 40 0.6562 nan 0.1000 0.0009
## 60 0.6286 nan 0.1000 0.0002
## 80 0.6135 nan 0.1000 0.0002
## 100 0.6020 nan 0.1000 0.0002
## 120 0.5950 nan 0.1000 0.0001
## 140 0.5896 nan 0.1000 0.0001
## 150 0.5879 nan 0.1000 -0.0000
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0577 nan 0.1000 0.0186
## 2 1.0281 nan 0.1000 0.0150
## 3 1.0038 nan 0.1000 0.0122
## 4 0.9832 nan 0.1000 0.0105
## 5 0.9638 nan 0.1000 0.0097
## 6 0.9498 nan 0.1000 0.0072
## 7 0.9331 nan 0.1000 0.0083
## 8 0.9181 nan 0.1000 0.0077
## 9 0.9082 nan 0.1000 0.0049
## 10 0.8952 nan 0.1000 0.0062
## 20 0.8124 nan 0.1000 0.0034
## 40 0.7361 nan 0.1000 0.0010
## 60 0.6940 nan 0.1000 0.0005
## 80 0.6701 nan 0.1000 0.0003
## 100 0.6530 nan 0.1000 0.0004
## 120 0.6437 nan 0.1000 0.0000
## 140 0.6354 nan 0.1000 0.0000
## 150 0.6323 nan 0.1000 0.0000
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0436 nan 0.1000 0.0258
## 2 1.0049 nan 0.1000 0.0197
## 3 0.9733 nan 0.1000 0.0155
## 4 0.9445 nan 0.1000 0.0144
## 5 0.9153 nan 0.1000 0.0148
## 6 0.8913 nan 0.1000 0.0120
## 7 0.8717 nan 0.1000 0.0098
## 8 0.8572 nan 0.1000 0.0072
## 9 0.8430 nan 0.1000 0.0070
## 10 0.8313 nan 0.1000 0.0060
## 20 0.7475 nan 0.1000 0.0037
## 40 0.6765 nan 0.1000 0.0012
## 60 0.6460 nan 0.1000 0.0002
## 80 0.6293 nan 0.1000 0.0001
## 100 0.6177 nan 0.1000 0.0002
## 120 0.6090 nan 0.1000 0.0001
## 140 0.6031 nan 0.1000 0.0000
## 150 0.6004 nan 0.1000 0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0359 nan 0.1000 0.0297
## 2 0.9896 nan 0.1000 0.0233
## 3 0.9514 nan 0.1000 0.0191
## 4 0.9202 nan 0.1000 0.0158
## 5 0.8937 nan 0.1000 0.0128
## 6 0.8692 nan 0.1000 0.0123
## 7 0.8483 nan 0.1000 0.0104
## 8 0.8330 nan 0.1000 0.0076
## 9 0.8168 nan 0.1000 0.0082
## 10 0.8043 nan 0.1000 0.0063
## 20 0.7204 nan 0.1000 0.0030
## 40 0.6545 nan 0.1000 0.0009
## 60 0.6269 nan 0.1000 0.0004
## 80 0.6115 nan 0.1000 0.0000
## 100 0.6003 nan 0.1000 0.0002
## 120 0.5920 nan 0.1000 0.0000
## 140 0.5855 nan 0.1000 0.0000
## 150 0.5834 nan 0.1000 0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0579 nan 0.1000 0.0188
## 2 1.0283 nan 0.1000 0.0151
## 3 1.0031 nan 0.1000 0.0124
## 4 0.9827 nan 0.1000 0.0099
## 5 0.9629 nan 0.1000 0.0099
## 6 0.9457 nan 0.1000 0.0087
## 7 0.9323 nan 0.1000 0.0066
## 8 0.9161 nan 0.1000 0.0078
## 9 0.9029 nan 0.1000 0.0063
## 10 0.8932 nan 0.1000 0.0049
## 20 0.8127 nan 0.1000 0.0034
## 40 0.7320 nan 0.1000 0.0015
## 60 0.6930 nan 0.1000 0.0012
## 80 0.6677 nan 0.1000 0.0005
## 100 0.6532 nan 0.1000 0.0002
## 120 0.6410 nan 0.1000 0.0002
## 140 0.6336 nan 0.1000 0.0001
## 150 0.6308 nan 0.1000 0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0434 nan 0.1000 0.0264
## 2 1.0037 nan 0.1000 0.0202
## 3 0.9704 nan 0.1000 0.0168
## 4 0.9437 nan 0.1000 0.0133
## 5 0.9137 nan 0.1000 0.0149
## 6 0.8900 nan 0.1000 0.0118
## 7 0.8704 nan 0.1000 0.0099
## 8 0.8550 nan 0.1000 0.0076
## 9 0.8404 nan 0.1000 0.0072
## 10 0.8292 nan 0.1000 0.0057
## 20 0.7462 nan 0.1000 0.0034
## 40 0.6745 nan 0.1000 0.0009
## 60 0.6445 nan 0.1000 0.0006
## 80 0.6290 nan 0.1000 0.0002
## 100 0.6176 nan 0.1000 0.0001
## 120 0.6101 nan 0.1000 0.0001
## 140 0.6047 nan 0.1000 0.0001
## 150 0.6019 nan 0.1000 0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0356 nan 0.1000 0.0300
## 2 0.9887 nan 0.1000 0.0236
## 3 0.9516 nan 0.1000 0.0185
## 4 0.9193 nan 0.1000 0.0162
## 5 0.8919 nan 0.1000 0.0132
## 6 0.8699 nan 0.1000 0.0112
## 7 0.8477 nan 0.1000 0.0109
## 8 0.8291 nan 0.1000 0.0090
## 9 0.8131 nan 0.1000 0.0078
## 10 0.8015 nan 0.1000 0.0055
## 20 0.7186 nan 0.1000 0.0032
## 40 0.6518 nan 0.1000 0.0009
## 60 0.6236 nan 0.1000 0.0003
## 80 0.6103 nan 0.1000 0.0002
## 100 0.5994 nan 0.1000 0.0001
## 120 0.5924 nan 0.1000 -0.0000
## 140 0.5860 nan 0.1000 0.0001
## 150 0.5836 nan 0.1000 0.0000
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0578 nan 0.1000 0.0187
## 2 1.0276 nan 0.1000 0.0150
## 3 1.0029 nan 0.1000 0.0122
## 4 0.9828 nan 0.1000 0.0100
## 5 0.9631 nan 0.1000 0.0098
## 6 0.9492 nan 0.1000 0.0067
## 7 0.9331 nan 0.1000 0.0081
## 8 0.9176 nan 0.1000 0.0078
## 9 0.9050 nan 0.1000 0.0063
## 10 0.8951 nan 0.1000 0.0048
## 20 0.8148 nan 0.1000 0.0022
## 40 0.7354 nan 0.1000 0.0015
## 60 0.6953 nan 0.1000 0.0005
## 80 0.6709 nan 0.1000 0.0003
## 100 0.6559 nan 0.1000 0.0002
## 120 0.6441 nan 0.1000 0.0003
## 140 0.6361 nan 0.1000 0.0001
## 150 0.6332 nan 0.1000 0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0439 nan 0.1000 0.0258
## 2 1.0049 nan 0.1000 0.0199
## 3 0.9711 nan 0.1000 0.0165
## 4 0.9446 nan 0.1000 0.0131
## 5 0.9152 nan 0.1000 0.0148
## 6 0.8917 nan 0.1000 0.0121
## 7 0.8721 nan 0.1000 0.0097
## 8 0.8574 nan 0.1000 0.0070
## 9 0.8433 nan 0.1000 0.0070
## 10 0.8319 nan 0.1000 0.0056
## 20 0.7507 nan 0.1000 0.0031
## 40 0.6784 nan 0.1000 0.0012
## 60 0.6484 nan 0.1000 0.0004
## 80 0.6309 nan 0.1000 0.0003
## 100 0.6197 nan 0.1000 0.0001
## 120 0.6108 nan 0.1000 0.0001
## 140 0.6050 nan 0.1000 0.0002
## 150 0.6028 nan 0.1000 0.0000
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0363 nan 0.1000 0.0291
## 2 0.9890 nan 0.1000 0.0236
## 3 0.9518 nan 0.1000 0.0188
## 4 0.9205 nan 0.1000 0.0160
## 5 0.8954 nan 0.1000 0.0123
## 6 0.8732 nan 0.1000 0.0113
## 7 0.8542 nan 0.1000 0.0093
## 8 0.8382 nan 0.1000 0.0079
## 9 0.8214 nan 0.1000 0.0082
## 10 0.8061 nan 0.1000 0.0078
## 20 0.7234 nan 0.1000 0.0023
## 40 0.6544 nan 0.1000 0.0008
## 60 0.6275 nan 0.1000 0.0005
## 80 0.6109 nan 0.1000 0.0001
## 100 0.6005 nan 0.1000 0.0000
## 120 0.5926 nan 0.1000 0.0001
## 140 0.5870 nan 0.1000 0.0000
## 150 0.5850 nan 0.1000 -0.0000
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0590 nan 0.1000 0.0184
## 2 1.0286 nan 0.1000 0.0148
## 3 1.0044 nan 0.1000 0.0120
## 4 0.9842 nan 0.1000 0.0099
## 5 0.9653 nan 0.1000 0.0096
## 6 0.9481 nan 0.1000 0.0085
## 7 0.9321 nan 0.1000 0.0078
## 8 0.9192 nan 0.1000 0.0064
## 9 0.9057 nan 0.1000 0.0064
## 10 0.8962 nan 0.1000 0.0045
## 20 0.8155 nan 0.1000 0.0028
## 40 0.7380 nan 0.1000 0.0009
## 60 0.6962 nan 0.1000 0.0007
## 80 0.6712 nan 0.1000 0.0005
## 100 0.6558 nan 0.1000 0.0004
## 120 0.6448 nan 0.1000 0.0001
## 140 0.6378 nan 0.1000 0.0001
## 150 0.6350 nan 0.1000 0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0434 nan 0.1000 0.0257
## 2 1.0038 nan 0.1000 0.0197
## 3 0.9706 nan 0.1000 0.0163
## 4 0.9440 nan 0.1000 0.0131
## 5 0.9148 nan 0.1000 0.0145
## 6 0.8919 nan 0.1000 0.0115
## 7 0.8761 nan 0.1000 0.0077
## 8 0.8603 nan 0.1000 0.0079
## 9 0.8440 nan 0.1000 0.0081
## 10 0.8321 nan 0.1000 0.0058
## 20 0.7496 nan 0.1000 0.0033
## 40 0.6781 nan 0.1000 0.0013
## 60 0.6477 nan 0.1000 0.0003
## 80 0.6314 nan 0.1000 0.0002
## 100 0.6196 nan 0.1000 0.0003
## 120 0.6113 nan 0.1000 0.0002
## 140 0.6053 nan 0.1000 0.0001
## 150 0.6032 nan 0.1000 0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0359 nan 0.1000 0.0300
## 2 0.9894 nan 0.1000 0.0233
## 3 0.9526 nan 0.1000 0.0186
## 4 0.9219 nan 0.1000 0.0152
## 5 0.8947 nan 0.1000 0.0134
## 6 0.8693 nan 0.1000 0.0126
## 7 0.8506 nan 0.1000 0.0095
## 8 0.8321 nan 0.1000 0.0091
## 9 0.8166 nan 0.1000 0.0076
## 10 0.8025 nan 0.1000 0.0069
## 20 0.7209 nan 0.1000 0.0029
## 40 0.6554 nan 0.1000 0.0011
## 60 0.6288 nan 0.1000 0.0004
## 80 0.6132 nan 0.1000 0.0000
## 100 0.6023 nan 0.1000 0.0001
## 120 0.5950 nan 0.1000 0.0001
## 140 0.5891 nan 0.1000 0.0002
## 150 0.5868 nan 0.1000 -0.0000
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0576 nan 0.1000 0.0187
## 2 1.0279 nan 0.1000 0.0150
## 3 1.0032 nan 0.1000 0.0122
## 4 0.9830 nan 0.1000 0.0099
## 5 0.9636 nan 0.1000 0.0098
## 6 0.9496 nan 0.1000 0.0070
## 7 0.9326 nan 0.1000 0.0083
## 8 0.9171 nan 0.1000 0.0078
## 9 0.9043 nan 0.1000 0.0064
## 10 0.8947 nan 0.1000 0.0048
## 20 0.8137 nan 0.1000 0.0031
## 40 0.7345 nan 0.1000 0.0013
## 60 0.6933 nan 0.1000 0.0005
## 80 0.6692 nan 0.1000 0.0006
## 100 0.6525 nan 0.1000 0.0001
## 120 0.6413 nan 0.1000 0.0004
## 140 0.6345 nan 0.1000 0.0003
## 150 0.6303 nan 0.1000 0.0003
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0427 nan 0.1000 0.0261
## 2 1.0033 nan 0.1000 0.0197
## 3 0.9705 nan 0.1000 0.0164
## 4 0.9435 nan 0.1000 0.0138
## 5 0.9144 nan 0.1000 0.0144
## 6 0.8914 nan 0.1000 0.0119
## 7 0.8719 nan 0.1000 0.0096
## 8 0.8569 nan 0.1000 0.0073
## 9 0.8450 nan 0.1000 0.0059
## 10 0.8323 nan 0.1000 0.0065
## 20 0.7495 nan 0.1000 0.0031
## 40 0.6751 nan 0.1000 0.0008
## 60 0.6454 nan 0.1000 0.0005
## 80 0.6285 nan 0.1000 0.0003
## 100 0.6174 nan 0.1000 0.0001
## 120 0.6084 nan 0.1000 0.0001
## 140 0.6018 nan 0.1000 0.0001
## 150 0.5993 nan 0.1000 0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0360 nan 0.1000 0.0294
## 2 0.9888 nan 0.1000 0.0230
## 3 0.9519 nan 0.1000 0.0185
## 4 0.9189 nan 0.1000 0.0165
## 5 0.8933 nan 0.1000 0.0127
## 6 0.8682 nan 0.1000 0.0125
## 7 0.8490 nan 0.1000 0.0095
## 8 0.8307 nan 0.1000 0.0090
## 9 0.8141 nan 0.1000 0.0084
## 10 0.8000 nan 0.1000 0.0069
## 20 0.7168 nan 0.1000 0.0026
## 40 0.6492 nan 0.1000 0.0008
## 60 0.6242 nan 0.1000 0.0004
## 80 0.6081 nan 0.1000 0.0002
## 100 0.5975 nan 0.1000 0.0002
## 120 0.5901 nan 0.1000 -0.0000
## 140 0.5841 nan 0.1000 0.0000
## 150 0.5819 nan 0.1000 0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0578 nan 0.1000 0.0186
## 2 1.0282 nan 0.1000 0.0150
## 3 1.0042 nan 0.1000 0.0122
## 4 0.9839 nan 0.1000 0.0097
## 5 0.9644 nan 0.1000 0.0098
## 6 0.9472 nan 0.1000 0.0085
## 7 0.9340 nan 0.1000 0.0062
## 8 0.9178 nan 0.1000 0.0078
## 9 0.9043 nan 0.1000 0.0063
## 10 0.8947 nan 0.1000 0.0046
## 20 0.8143 nan 0.1000 0.0031
## 40 0.7347 nan 0.1000 0.0015
## 60 0.6947 nan 0.1000 0.0009
## 80 0.6698 nan 0.1000 0.0004
## 100 0.6542 nan 0.1000 0.0002
## 120 0.6424 nan 0.1000 0.0001
## 140 0.6347 nan 0.1000 0.0001
## 150 0.6316 nan 0.1000 0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0443 nan 0.1000 0.0262
## 2 1.0046 nan 0.1000 0.0199
## 3 0.9716 nan 0.1000 0.0167
## 4 0.9439 nan 0.1000 0.0139
## 5 0.9151 nan 0.1000 0.0146
## 6 0.8917 nan 0.1000 0.0118
## 7 0.8721 nan 0.1000 0.0096
## 8 0.8574 nan 0.1000 0.0075
## 9 0.8452 nan 0.1000 0.0061
## 10 0.8324 nan 0.1000 0.0064
## 20 0.7493 nan 0.1000 0.0024
## 40 0.6751 nan 0.1000 0.0012
## 60 0.6458 nan 0.1000 0.0003
## 80 0.6286 nan 0.1000 0.0005
## 100 0.6166 nan 0.1000 0.0004
## 120 0.6092 nan 0.1000 0.0001
## 140 0.6020 nan 0.1000 0.0000
## 150 0.5996 nan 0.1000 0.0000
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0365 nan 0.1000 0.0300
## 2 0.9892 nan 0.1000 0.0232
## 3 0.9524 nan 0.1000 0.0183
## 4 0.9203 nan 0.1000 0.0161
## 5 0.8943 nan 0.1000 0.0131
## 6 0.8719 nan 0.1000 0.0110
## 7 0.8527 nan 0.1000 0.0094
## 8 0.8336 nan 0.1000 0.0094
## 9 0.8171 nan 0.1000 0.0080
## 10 0.8034 nan 0.1000 0.0068
## 20 0.7198 nan 0.1000 0.0023
## 40 0.6533 nan 0.1000 0.0008
## 60 0.6264 nan 0.1000 0.0002
## 80 0.6089 nan 0.1000 0.0003
## 100 0.5989 nan 0.1000 0.0001
## 120 0.5904 nan 0.1000 0.0001
## 140 0.5842 nan 0.1000 -0.0000
## 150 0.5822 nan 0.1000 0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.0364 nan 0.1000 0.0292
## 2 0.9899 nan 0.1000 0.0230
## 3 0.9530 nan 0.1000 0.0189
## 4 0.9207 nan 0.1000 0.0164
## 5 0.8942 nan 0.1000 0.0129
## 6 0.8729 nan 0.1000 0.0109
## 7 0.8514 nan 0.1000 0.0108
## 8 0.8331 nan 0.1000 0.0092
## 9 0.8200 nan 0.1000 0.0063
## 10 0.8074 nan 0.1000 0.0063
## 20 0.7214 nan 0.1000 0.0030
## 40 0.6544 nan 0.1000 0.0009
## 60 0.6264 nan 0.1000 0.0003
## 80 0.6110 nan 0.1000 0.0003
## 100 0.5996 nan 0.1000 0.0001
## 120 0.5926 nan 0.1000 0.0000
## 140 0.5873 nan 0.1000 -0.0000
## 150 0.5848 nan 0.1000 0.0001
# Variable importance for the tuned boosting model: summary() on the caret
# train object dispatches to the underlying gbm fit and reports (and plots)
# the relative influence of each predictor, sorted in decreasing order.
summary(boostingtrain)
## var rel.inf
## Married.civ.spouse Married.civ.spouse 36.52279385
## capital.gain capital.gain 19.98614726
## education.num education.num 19.43215771
## age age 6.59421528
## capital.loss capital.loss 6.30308676
## hours.per.week hours.per.week 4.36684944
## Exec.managerial Exec.managerial 2.06255757
## Wife Wife 0.69164792
## Prof.specialty Prof.specialty 0.61752910
## Farming.fishing Farming.fishing 0.55237470
## Self.emp.not.inc Self.emp.not.inc 0.49422938
## Other.service Other.service 0.41444610
## Male Male 0.39370763
## Tech.support Tech.support 0.32076657
## Married.AF.spouse Married.AF.spouse 0.17353512
## Sales Sales 0.16611691
## Local.gov Local.gov 0.14324173
## Not.in.family Not.in.family 0.13164689
## Self.emp.inc Self.emp.inc 0.09836303
## Machine.op.inspct Machine.op.inspct 0.09136793
## White White 0.08946089
## Protective.serv Protective.serv 0.08671965
## United.States United.States 0.07037183
## Handlers.cleaners Handlers.cleaners 0.06623173
## Own.child Own.child 0.05166759
## Philippines Philippines 0.04248981
## Widowed Widowed 0.01356904
## State.gov State.gov 0.01209187
## Transport.moving Transport.moving 0.01061673
## No.gain No.gain 0.00000000
## Private Private 0.00000000
## Married.spouse.absent Married.spouse.absent 0.00000000
## Never.married Never.married 0.00000000
## Separated Separated 0.00000000
## Armed.Forces Armed.Forces 0.00000000
## Craft.repair Craft.repair 0.00000000
## Priv.house.serv Priv.house.serv 0.00000000
## Other.relative Other.relative 0.00000000
## Unmarried Unmarried 0.00000000
## Asian.Pac.Islander Asian.Pac.Islander 0.00000000
## Black Black 0.00000000
## Other Other 0.00000000
## other_countries other_countries 0.00000000
# Cross-validation summary for the fitted boosting model: resampling scheme,
# accuracy/kappa over the tuning grid, and the selected final parameters.
print(boostingtrain)
## Stochastic Gradient Boosting
##
## 32402 samples
## 43 predictor
## 2 classes: '<=50K', '>50K'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 29162, 29161, 29162, 29162, 29162, 29162, ...
## Resampling results across tuning parameters:
##
## interaction.depth n.trees Accuracy Kappa
## 1 50 0.8431146 0.4816248
## 1 100 0.8524843 0.5377526
## 1 150 0.8540584 0.5478247
## 2 50 0.8532744 0.5423939
## 2 100 0.8570582 0.5620259
## 2 150 0.8604901 0.5781097
## 3 50 0.8552002 0.5510964
## 3 100 0.8612740 0.5805378
## 3 150 0.8642429 0.5933076
##
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
##
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 150,
## interaction.depth = 3, shrinkage = 0.1 and n.minobsinnode = 10.
# Tuning-parameter combination that achieved the best resampled accuracy.
boostingtrain[["bestTune"]]
## n.trees interaction.depth shrinkage n.minobsinnode
## 9 150 3 0.1 10
# Full tuning grid: accuracy/kappa (with their SDs) for every
# shrinkage / interaction.depth / n.minobsinnode / n.trees combination.
boostingtrain[["results"]]
## shrinkage interaction.depth n.minobsinnode n.trees Accuracy Kappa
## 1 0.1 1 10 50 0.8431146 0.4816248
## 4 0.1 2 10 50 0.8532744 0.5423939
## 7 0.1 3 10 50 0.8552002 0.5510964
## 2 0.1 1 10 100 0.8524843 0.5377526
## 5 0.1 2 10 100 0.8570582 0.5620259
## 8 0.1 3 10 100 0.8612740 0.5805378
## 3 0.1 1 10 150 0.8540584 0.5478247
## 6 0.1 2 10 150 0.8604901 0.5781097
## 9 0.1 3 10 150 0.8642429 0.5933076
## AccuracySD KappaSD
## 1 0.004809954 0.01989279
## 4 0.005088176 0.01718602
## 7 0.005030751 0.01692701
## 2 0.005360968 0.01868402
## 5 0.005063951 0.01689716
## 8 0.004921149 0.01572846
## 3 0.005253151 0.01767353
## 6 0.004820025 0.01558992
## 9 0.004983303 0.01554980
# The final gbm model, refit on the full training set with the winning
# tuning parameters.
boostingtrain[["finalModel"]]
## A gradient boosted model with bernoulli loss function.
## 150 iterations were performed.
## There were 43 predictors of which 29 had non-zero influence.
# Per-resample performance of the best model: accuracy and kappa for each
# held-out fold (10-fold CV repeated 5 times).
boostingtrain[["resample"]]
## Accuracy Kappa Resample
## 1 0.8731481 0.6167872 Fold04.Rep5
## 2 0.8673249 0.6005170 Fold01.Rep4
## 3 0.8623457 0.5871899 Fold03.Rep5
## 4 0.8756173 0.6304083 Fold10.Rep3
## 5 0.8604938 0.5754363 Fold05.Rep5
## 6 0.8672840 0.6073511 Fold02.Rep5
## 7 0.8608454 0.5856595 Fold09.Rep3
## 8 0.8567901 0.5705294 Fold02.Rep4
## 9 0.8642394 0.5943947 Fold01.Rep5
## 10 0.8614198 0.5842108 Fold08.Rep3
## 11 0.8716049 0.6145822 Fold09.Rep2
## 12 0.8568343 0.5676637 Fold06.Rep5
## 13 0.8608025 0.5851871 Fold10.Rep4
## 14 0.8604938 0.5733285 Fold07.Rep3
## 15 0.8694444 0.6113164 Fold08.Rep2
## 16 0.8673249 0.6051431 Fold03.Rep4
## 17 0.8638889 0.5904222 Fold09.Rep4
## 18 0.8577160 0.5739284 Fold08.Rep4
## 19 0.8611540 0.5835627 Fold07.Rep2
## 20 0.8657407 0.5955996 Fold10.Rep2
## 21 0.8722222 0.6190340 Fold07.Rep5
## 22 0.8632716 0.5905646 Fold07.Rep4
## 23 0.8660907 0.5987596 Fold06.Rep2
## 24 0.8570988 0.5737375 Fold07.Rep1
## 25 0.8672840 0.6012228 Fold04.Rep4
## 26 0.8604938 0.5808202 Fold04.Rep3
## 27 0.8632716 0.5873554 Fold05.Rep2
## 28 0.8703704 0.6108762 Fold06.Rep1
## 29 0.8645480 0.5970731 Fold01.Rep3
## 30 0.8679012 0.6050041 Fold08.Rep5
## 31 0.8580247 0.5670664 Fold06.Rep3
## 32 0.8638889 0.5924131 Fold04.Rep2
## 33 0.8608025 0.5827651 Fold08.Rep1
## 34 0.8626543 0.5871106 Fold05.Rep4
## 35 0.8641975 0.5962809 Fold05.Rep3
## 36 0.8651235 0.5925450 Fold03.Rep2
## 37 0.8663580 0.5982447 Fold04.Rep1
## 38 0.8614198 0.5901983 Fold02.Rep3
## 39 0.8657407 0.5952038 Fold09.Rep5
## 40 0.8580247 0.5771138 Fold02.Rep2
## 41 0.8645062 0.5954414 Fold03.Rep1
## 42 0.8707189 0.6131911 Fold09.Rep1
## 43 0.8719136 0.6138151 Fold06.Rep4
## 44 0.8558642 0.5671205 Fold01.Rep2
## 45 0.8651651 0.5957897 Fold02.Rep1
## 46 0.8558642 0.5666981 Fold05.Rep1
## 47 0.8688272 0.6113568 Fold03.Rep3
## 48 0.8570988 0.5849905 Fold10.Rep5
## 49 0.8688272 0.6102321 Fold10.Rep1
## 50 0.8700617 0.6101397 Fold01.Rep1
# Confusion-matrix cell counts per tuning combination and resample,
# useful for inspecting where misclassifications occur across folds.
boostingtrain[["resampledCM"]]
## shrinkage interaction.depth n.minobsinnode n.trees cell1 cell2 cell3
## 1 0.1 1 10 150 2357 115 352
## 2 0.1 1 10 50 2402 70 427
## 3 0.1 1 10 100 2364 108 363
## 4 0.1 2 10 150 2355 117 313
## 5 0.1 2 10 50 2363 109 362
## 6 0.1 2 10 100 2362 110 334
## 7 0.1 3 10 150 2349 123 298
## 8 0.1 3 10 50 2359 113 355
## 9 0.1 3 10 100 2354 118 320
## 10 0.1 1 10 150 2354 118 356
## 11 0.1 1 10 50 2404 68 458
## 12 0.1 1 10 100 2363 109 371
## 13 0.1 2 10 150 2340 132 324
## 14 0.1 2 10 50 2366 106 361
## 15 0.1 2 10 100 2349 123 340
## 16 0.1 3 10 150 2341 131 306
## 17 0.1 3 10 50 2356 116 354
## 18 0.1 3 10 100 2338 134 318
## 19 0.1 1 10 150 2365 107 357
## 20 0.1 1 10 50 2405 67 440
## 21 0.1 1 10 100 2382 90 372
## 22 0.1 2 10 150 2345 127 322
## 23 0.1 2 10 50 2374 98 368
## 24 0.1 2 10 100 2360 112 346
## 25 0.1 3 10 150 2335 137 302
## 26 0.1 3 10 50 2370 102 360
## 27 0.1 3 10 100 2342 130 320
## 28 0.1 1 10 150 2340 132 353
## 29 0.1 1 10 50 2401 71 428
## 30 0.1 1 10 100 2352 120 369
## 31 0.1 2 10 150 2341 131 322
## 32 0.1 2 10 50 2344 128 360
## 33 0.1 2 10 100 2339 133 338
## 34 0.1 3 10 150 2345 127 306
## 35 0.1 3 10 50 2347 125 356
## 36 0.1 3 10 100 2345 127 325
## 37 0.1 1 10 150 2343 129 371
## 38 0.1 1 10 50 2394 78 437
## 39 0.1 1 10 100 2346 126 374
## 40 0.1 2 10 150 2325 147 331
## 41 0.1 2 10 50 2345 127 370
## 42 0.1 2 10 100 2343 129 353
## 43 0.1 3 10 150 2328 144 323
## 44 0.1 3 10 50 2343 129 363
## 45 0.1 3 10 100 2331 141 336
## 46 0.1 1 10 150 2360 112 347
## 47 0.1 1 10 50 2404 68 435
## 48 0.1 1 10 100 2368 104 359
## 49 0.1 2 10 150 2346 126 311
## 50 0.1 2 10 50 2362 110 359
## 51 0.1 2 10 100 2349 123 336
## 52 0.1 3 10 150 2350 122 298
## 53 0.1 3 10 50 2362 110 347
## 54 0.1 3 10 100 2351 121 311
## 55 0.1 1 10 150 2347 125 364
## 56 0.1 1 10 50 2360 112 426
## 57 0.1 1 10 100 2354 118 374
## 58 0.1 2 10 150 2324 148 323
## 59 0.1 2 10 50 2342 130 359
## 60 0.1 2 10 100 2333 139 341
## 61 0.1 3 10 150 2322 150 313
## 62 0.1 3 10 50 2342 130 357
## 63 0.1 3 10 100 2329 143 327
## 64 0.1 1 10 150 2354 118 362
## 65 0.1 1 10 50 2408 64 449
## 66 0.1 1 10 100 2373 99 391
## 67 0.1 2 10 150 2341 131 330
## 68 0.1 2 10 50 2368 104 378
## 69 0.1 2 10 100 2349 123 348
## 70 0.1 3 10 150 2333 139 312
## 71 0.1 3 10 50 2362 110 366
## 72 0.1 3 10 100 2346 126 321
## 73 0.1 1 10 150 2364 108 360
## 74 0.1 1 10 50 2410 62 438
## 75 0.1 1 10 100 2371 101 373
## 76 0.1 2 10 150 2352 120 314
## 77 0.1 2 10 50 2372 100 366
## 78 0.1 2 10 100 2360 112 336
## 79 0.1 3 10 150 2348 124 295
## 80 0.1 3 10 50 2366 106 350
## 81 0.1 3 10 100 2350 122 317
## 82 0.1 1 10 150 2365 107 347
## 83 0.1 1 10 50 2403 69 429
## 84 0.1 1 10 100 2374 98 358
## 85 0.1 2 10 150 2337 135 309
## 86 0.1 2 10 50 2371 101 356
## 87 0.1 2 10 100 2355 117 335
## 88 0.1 3 10 150 2337 135 290
## 89 0.1 3 10 50 2367 105 352
## 90 0.1 3 10 100 2345 127 308
## 91 0.1 1 10 150 2352 120 367
## 92 0.1 1 10 50 2403 69 457
## 93 0.1 1 10 100 2364 108 393
## 94 0.1 2 10 150 2330 142 332
## 95 0.1 2 10 50 2361 111 379
## 96 0.1 2 10 100 2339 133 352
## 97 0.1 3 10 150 2327 145 322
## 98 0.1 3 10 50 2351 121 368
## 99 0.1 3 10 100 2334 138 334
## 100 0.1 1 10 150 2336 136 361
## 101 0.1 1 10 50 2387 85 434
## 102 0.1 1 10 100 2343 129 374
## 103 0.1 2 10 150 2323 149 317
## 104 0.1 2 10 50 2342 130 367
## 105 0.1 2 10 100 2333 139 348
## 106 0.1 3 10 150 2322 150 310
## 107 0.1 3 10 50 2342 130 360
## 108 0.1 3 10 100 2325 147 324
## 109 0.1 1 10 150 2354 118 353
## 110 0.1 1 10 50 2402 70 425
## 111 0.1 1 10 100 2362 110 363
## 112 0.1 2 10 150 2351 121 328
## 113 0.1 2 10 50 2359 113 357
## 114 0.1 2 10 100 2356 116 333
## 115 0.1 3 10 150 2348 124 313
## 116 0.1 3 10 50 2360 112 351
## 117 0.1 3 10 100 2351 121 321
## 118 0.1 1 10 150 2361 111 353
## 119 0.1 1 10 50 2399 73 426
## 120 0.1 1 10 100 2365 107 363
## 121 0.1 2 10 150 2342 130 315
## 122 0.1 2 10 50 2360 112 360
## 123 0.1 2 10 100 2351 121 336
## 124 0.1 3 10 150 2337 135 306
## 125 0.1 3 10 50 2359 113 357
## 126 0.1 3 10 100 2343 129 315
## 127 0.1 1 10 150 2348 124 365
## 128 0.1 1 10 50 2396 76 440
## 129 0.1 1 10 100 2356 116 373
## 130 0.1 2 10 150 2344 128 323
## 131 0.1 2 10 50 2359 113 371
## 132 0.1 2 10 100 2349 123 351
## 133 0.1 3 10 150 2344 128 315
## 134 0.1 3 10 50 2355 117 361
## 135 0.1 3 10 100 2346 126 326
## 136 0.1 1 10 150 2364 108 344
## 137 0.1 1 10 50 2404 68 428
## 138 0.1 1 10 100 2370 102 356
## 139 0.1 2 10 150 2339 133 315
## 140 0.1 2 10 50 2365 107 354
## 141 0.1 2 10 100 2346 126 333
## 142 0.1 3 10 150 2342 130 304
## 143 0.1 3 10 50 2359 113 344
## 144 0.1 3 10 100 2343 129 314
## 145 0.1 1 10 150 2353 119 366
## 146 0.1 1 10 50 2399 73 428
## 147 0.1 1 10 100 2368 104 379
## 148 0.1 2 10 150 2335 137 329
## 149 0.1 2 10 50 2351 121 365
## 150 0.1 2 10 100 2348 124 346
## 151 0.1 3 10 150 2335 137 313
## 152 0.1 3 10 50 2349 123 357
## 153 0.1 3 10 100 2339 133 328
## 154 0.1 1 10 150 2354 118 344
## 155 0.1 1 10 50 2408 64 445
## 156 0.1 1 10 100 2361 111 360
## 157 0.1 2 10 150 2344 128 313
## 158 0.1 2 10 50 2367 105 355
## 159 0.1 2 10 100 2350 122 327
## 160 0.1 3 10 150 2340 132 291
## 161 0.1 3 10 50 2363 109 340
## 162 0.1 3 10 100 2346 126 309
## 163 0.1 1 10 150 2368 104 360
## 164 0.1 1 10 50 2416 56 429
## 165 0.1 1 10 100 2379 93 370
## 166 0.1 2 10 150 2349 123 314
## 167 0.1 2 10 50 2374 98 370
## 168 0.1 2 10 100 2356 116 340
## 169 0.1 3 10 150 2352 120 296
## 170 0.1 3 10 50 2372 100 361
## 171 0.1 3 10 100 2353 119 317
## 172 0.1 1 10 150 2368 104 355
## 173 0.1 1 10 50 2405 67 456
## 174 0.1 1 10 100 2372 100 371
## 175 0.1 2 10 150 2345 127 322
## 176 0.1 2 10 50 2368 104 364
## 177 0.1 2 10 100 2362 110 341
## 178 0.1 3 10 150 2346 126 309
## 179 0.1 3 10 50 2372 100 359
## 180 0.1 3 10 100 2350 122 315
## 181 0.1 1 10 150 2350 122 337
## 182 0.1 1 10 50 2362 110 385
## 183 0.1 1 10 100 2353 119 349
## 184 0.1 2 10 150 2340 132 312
## 185 0.1 2 10 50 2349 123 342
## 186 0.1 2 10 100 2345 127 323
## 187 0.1 3 10 150 2332 140 299
## 188 0.1 3 10 50 2348 124 329
## 189 0.1 3 10 100 2337 135 309
## 190 0.1 1 10 150 2345 127 336
## 191 0.1 1 10 50 2388 84 427
## 192 0.1 1 10 100 2358 114 355
## 193 0.1 2 10 150 2329 143 308
## 194 0.1 2 10 50 2355 117 345
## 195 0.1 2 10 100 2342 130 320
## 196 0.1 3 10 150 2320 152 297
## 197 0.1 3 10 50 2351 121 336
## 198 0.1 3 10 100 2331 141 303
## 199 0.1 1 10 150 2355 117 336
## 200 0.1 1 10 50 2409 63 438
## 201 0.1 1 10 100 2365 107 356
## 202 0.1 2 10 150 2347 125 303
## 203 0.1 2 10 50 2362 110 355
## 204 0.1 2 10 100 2349 123 324
## 205 0.1 3 10 150 2334 138 287
## 206 0.1 3 10 50 2360 112 341
## 207 0.1 3 10 100 2342 130 304
## 208 0.1 1 10 150 2349 123 366
## 209 0.1 1 10 50 2404 68 453
## 210 0.1 1 10 100 2363 109 383
## 211 0.1 2 10 150 2343 129 327
## 212 0.1 2 10 50 2360 112 385
## 213 0.1 2 10 100 2346 126 341
## 214 0.1 3 10 150 2335 137 315
## 215 0.1 3 10 50 2355 117 366
## 216 0.1 3 10 100 2339 133 325
## 217 0.1 1 10 150 2345 127 356
## 218 0.1 1 10 50 2390 82 433
## 219 0.1 1 10 100 2346 126 363
## 220 0.1 2 10 150 2333 139 313
## 221 0.1 2 10 50 2346 126 356
## 222 0.1 2 10 100 2340 132 335
## 223 0.1 3 10 150 2330 142 298
## 224 0.1 3 10 50 2344 128 351
## 225 0.1 3 10 100 2336 136 315
## 226 0.1 1 10 150 2367 105 392
## 227 0.1 1 10 50 2409 63 469
## 228 0.1 1 10 100 2379 93 409
## 229 0.1 2 10 150 2352 120 353
## 230 0.1 2 10 50 2375 97 404
## 231 0.1 2 10 100 2360 112 375
## 232 0.1 3 10 150 2346 126 334
## 233 0.1 3 10 50 2367 105 391
## 234 0.1 3 10 100 2353 119 352
## 235 0.1 1 10 150 2365 107 389
## 236 0.1 1 10 50 2413 59 464
## 237 0.1 1 10 100 2375 97 396
## 238 0.1 2 10 150 2352 120 351
## 239 0.1 2 10 50 2375 97 389
## 240 0.1 2 10 100 2352 120 365
## 241 0.1 3 10 150 2353 119 333
## 242 0.1 3 10 50 2376 96 383
## 243 0.1 3 10 100 2356 116 350
## 244 0.1 1 10 150 2366 106 351
## 245 0.1 1 10 50 2410 62 431
## 246 0.1 1 10 100 2376 96 366
## 247 0.1 2 10 150 2348 124 327
## 248 0.1 2 10 50 2374 98 361
## 249 0.1 2 10 100 2359 113 339
## 250 0.1 3 10 150 2335 137 312
## 251 0.1 3 10 50 2373 99 354
## 252 0.1 3 10 100 2346 126 326
## 253 0.1 1 10 150 2346 126 355
## 254 0.1 1 10 50 2373 99 400
## 255 0.1 1 10 100 2356 116 365
## 256 0.1 2 10 150 2331 141 326
## 257 0.1 2 10 50 2351 121 359
## 258 0.1 2 10 100 2337 135 342
## 259 0.1 3 10 150 2327 145 306
## 260 0.1 3 10 50 2354 118 352
## 261 0.1 3 10 100 2323 149 321
## 262 0.1 1 10 150 2361 111 344
## 263 0.1 1 10 50 2400 72 418
## 264 0.1 1 10 100 2370 102 360
## 265 0.1 2 10 150 2357 115 305
## 266 0.1 2 10 50 2372 100 359
## 267 0.1 2 10 100 2357 115 333
## 268 0.1 3 10 150 2348 124 279
## 269 0.1 3 10 50 2367 105 354
## 270 0.1 3 10 100 2362 110 300
## 271 0.1 1 10 150 2365 107 360
## 272 0.1 1 10 50 2408 64 436
## 273 0.1 1 10 100 2369 103 364
## 274 0.1 2 10 150 2344 128 315
## 275 0.1 2 10 50 2364 108 352
## 276 0.1 2 10 100 2356 116 339
## 277 0.1 3 10 150 2349 123 307
## 278 0.1 3 10 50 2365 107 350
## 279 0.1 3 10 100 2353 119 319
## 280 0.1 1 10 150 2347 125 376
## 281 0.1 1 10 50 2399 73 479
## 282 0.1 1 10 100 2354 118 390
## 283 0.1 2 10 150 2325 147 335
## 284 0.1 2 10 50 2354 118 385
## 285 0.1 2 10 100 2336 136 361
## 286 0.1 3 10 150 2327 145 319
## 287 0.1 3 10 50 2346 126 373
## 288 0.1 3 10 100 2326 146 335
## 289 0.1 1 10 150 2357 115 349
## 290 0.1 1 10 50 2408 64 431
## 291 0.1 1 10 100 2371 101 355
## 292 0.1 2 10 150 2339 133 309
## 293 0.1 2 10 50 2370 102 355
## 294 0.1 2 10 100 2352 120 328
## 295 0.1 3 10 150 2337 135 295
## 296 0.1 3 10 50 2363 109 339
## 297 0.1 3 10 100 2346 126 305
## 298 0.1 1 10 150 2365 107 358
## 299 0.1 1 10 50 2378 94 414
## 300 0.1 1 10 100 2375 97 382
## 301 0.1 2 10 150 2356 116 325
## 302 0.1 2 10 50 2369 103 368
## 303 0.1 2 10 100 2365 107 340
## 304 0.1 3 10 150 2346 126 304
## 305 0.1 3 10 50 2368 104 360
## 306 0.1 3 10 100 2349 123 318
## 307 0.1 1 10 150 2365 107 354
## 308 0.1 1 10 50 2400 72 441
## 309 0.1 1 10 100 2364 108 373
## 310 0.1 2 10 150 2343 129 323
## 311 0.1 2 10 50 2359 113 365
## 312 0.1 2 10 100 2353 119 341
## 313 0.1 3 10 150 2339 133 312
## 314 0.1 3 10 50 2358 114 354
## 315 0.1 3 10 100 2350 122 323
## 316 0.1 1 10 150 2365 107 336
## 317 0.1 1 10 50 2417 55 429
## 318 0.1 1 10 100 2376 96 356
## 319 0.1 2 10 150 2358 114 303
## 320 0.1 2 10 50 2370 102 351
## 321 0.1 2 10 100 2353 119 316
## 322 0.1 3 10 150 2357 115 300
## 323 0.1 3 10 50 2362 110 341
## 324 0.1 3 10 100 2359 113 305
## 325 0.1 1 10 150 2349 123 345
## 326 0.1 1 10 50 2395 77 422
## 327 0.1 1 10 100 2362 110 356
## 328 0.1 2 10 150 2339 133 321
## 329 0.1 2 10 50 2358 114 351
## 330 0.1 2 10 100 2346 126 329
## 331 0.1 3 10 150 2336 136 307
## 332 0.1 3 10 50 2360 112 340
## 333 0.1 3 10 100 2335 137 317
## 334 0.1 1 10 150 2351 121 372
## 335 0.1 1 10 50 2393 79 465
## 336 0.1 1 10 100 2353 119 388
## 337 0.1 2 10 150 2335 137 325
## 338 0.1 2 10 50 2351 121 378
## 339 0.1 2 10 100 2348 124 359
## 340 0.1 3 10 150 2327 145 316
## 341 0.1 3 10 50 2350 122 368
## 342 0.1 3 10 100 2335 137 330
## 343 0.1 1 10 150 2357 115 348
## 344 0.1 1 10 50 2399 73 422
## 345 0.1 1 10 100 2367 105 358
## 346 0.1 2 10 150 2341 131 327
## 347 0.1 2 10 50 2361 111 359
## 348 0.1 2 10 100 2349 123 345
## 349 0.1 3 10 150 2342 130 311
## 350 0.1 3 10 50 2359 113 352
## 351 0.1 3 10 100 2349 123 327
## 352 0.1 1 10 150 2341 131 367
## 353 0.1 1 10 50 2387 85 436
## 354 0.1 1 10 100 2349 123 381
## 355 0.1 2 10 150 2332 140 330
## 356 0.1 2 10 50 2346 126 377
## 357 0.1 2 10 100 2337 135 350
## 358 0.1 3 10 150 2327 145 306
## 359 0.1 3 10 50 2344 128 367
## 360 0.1 3 10 100 2332 140 328
## 361 0.1 1 10 150 2352 120 355
## 362 0.1 1 10 50 2398 74 436
## 363 0.1 1 10 100 2364 108 369
## 364 0.1 2 10 150 2347 125 331
## 365 0.1 2 10 50 2364 108 367
## 366 0.1 2 10 100 2349 123 348
## 367 0.1 3 10 150 2336 136 304
## 368 0.1 3 10 50 2355 117 360
## 369 0.1 3 10 100 2343 129 330
## 370 0.1 1 10 150 2357 115 343
## 371 0.1 1 10 50 2401 71 424
## 372 0.1 1 10 100 2363 109 350
## 373 0.1 2 10 150 2342 130 309
## 374 0.1 2 10 50 2361 111 338
## 375 0.1 2 10 100 2350 122 320
## 376 0.1 3 10 150 2330 142 288
## 377 0.1 3 10 50 2358 114 329
## 378 0.1 3 10 100 2337 135 304
## 379 0.1 1 10 150 2348 124 364
## 380 0.1 1 10 50 2398 74 449
## 381 0.1 1 10 100 2357 115 379
## 382 0.1 2 10 150 2339 133 335
## 383 0.1 2 10 50 2358 114 378
## 384 0.1 2 10 100 2346 126 353
## 385 0.1 3 10 150 2336 136 310
## 386 0.1 3 10 50 2348 124 361
## 387 0.1 3 10 100 2336 136 323
## 388 0.1 1 10 150 2378 94 349
## 389 0.1 1 10 50 2414 58 438
## 390 0.1 1 10 100 2380 92 363
## 391 0.1 2 10 150 2364 108 320
## 392 0.1 2 10 50 2380 92 360
## 393 0.1 2 10 100 2376 96 335
## 394 0.1 3 10 150 2361 111 300
## 395 0.1 3 10 50 2376 96 348
## 396 0.1 3 10 100 2359 113 316
## 397 0.1 1 10 150 2356 116 375
## 398 0.1 1 10 50 2411 61 447
## 399 0.1 1 10 100 2364 108 390
## 400 0.1 2 10 150 2343 129 348
## 401 0.1 2 10 50 2357 115 378
## 402 0.1 2 10 100 2345 127 358
## 403 0.1 3 10 150 2348 124 328
## 404 0.1 3 10 50 2359 113 372
## 405 0.1 3 10 100 2351 121 344
## 406 0.1 1 10 150 2365 107 383
## 407 0.1 1 10 50 2404 68 467
## 408 0.1 1 10 100 2371 101 397
## 409 0.1 2 10 150 2342 130 344
## 410 0.1 2 10 50 2362 110 387
## 411 0.1 2 10 100 2353 119 369
## 412 0.1 3 10 150 2335 137 327
## 413 0.1 3 10 50 2367 105 385
## 414 0.1 3 10 100 2343 129 338
## 415 0.1 1 10 150 2380 92 336
## 416 0.1 1 10 50 2410 62 418
## 417 0.1 1 10 100 2384 88 343
## 418 0.1 2 10 150 2350 122 305
## 419 0.1 2 10 50 2384 88 342
## 420 0.1 2 10 100 2366 106 325
## 421 0.1 3 10 150 2346 126 288
## 422 0.1 3 10 50 2379 93 336
## 423 0.1 3 10 100 2357 115 304
## 424 0.1 1 10 150 2356 116 361
## 425 0.1 1 10 50 2406 66 440
## 426 0.1 1 10 100 2369 103 383
## 427 0.1 2 10 150 2348 124 324
## 428 0.1 2 10 50 2362 110 371
## 429 0.1 2 10 100 2350 122 337
## 430 0.1 3 10 150 2342 130 298
## 431 0.1 3 10 50 2361 111 358
## 432 0.1 3 10 100 2346 126 317
## 433 0.1 1 10 150 2349 123 363
## 434 0.1 1 10 50 2401 71 438
## 435 0.1 1 10 100 2356 116 380
## 436 0.1 2 10 150 2343 129 325
## 437 0.1 2 10 50 2355 117 371
## 438 0.1 2 10 100 2345 127 347
## 439 0.1 3 10 150 2347 125 310
## 440 0.1 3 10 50 2355 117 362
## 441 0.1 3 10 100 2340 132 324
## 442 0.1 1 10 150 2319 153 329
## 443 0.1 1 10 50 2385 87 412
## 444 0.1 1 10 100 2331 141 341
## 445 0.1 2 10 150 2310 162 296
## 446 0.1 2 10 50 2324 148 343
## 447 0.1 2 10 100 2313 159 311
## 448 0.1 3 10 150 2294 178 285
## 449 0.1 3 10 50 2316 156 329
## 450 0.1 3 10 100 2305 167 294
## cell4 Resample
## 1 416 Fold01.Rep1
## 2 341 Fold01.Rep1
## 3 405 Fold01.Rep1
## 4 455 Fold01.Rep1
## 5 406 Fold01.Rep1
## 6 434 Fold01.Rep1
## 7 470 Fold01.Rep1
## 8 413 Fold01.Rep1
## 9 448 Fold01.Rep1
## 10 413 Fold02.Rep1
## 11 311 Fold02.Rep1
## 12 398 Fold02.Rep1
## 13 445 Fold02.Rep1
## 14 408 Fold02.Rep1
## 15 429 Fold02.Rep1
## 16 463 Fold02.Rep1
## 17 415 Fold02.Rep1
## 18 451 Fold02.Rep1
## 19 411 Fold03.Rep1
## 20 328 Fold03.Rep1
## 21 396 Fold03.Rep1
## 22 446 Fold03.Rep1
## 23 400 Fold03.Rep1
## 24 422 Fold03.Rep1
## 25 466 Fold03.Rep1
## 26 408 Fold03.Rep1
## 27 448 Fold03.Rep1
## 28 415 Fold04.Rep1
## 29 340 Fold04.Rep1
## 30 399 Fold04.Rep1
## 31 446 Fold04.Rep1
## 32 408 Fold04.Rep1
## 33 430 Fold04.Rep1
## 34 462 Fold04.Rep1
## 35 412 Fold04.Rep1
## 36 443 Fold04.Rep1
## 37 397 Fold05.Rep1
## 38 331 Fold05.Rep1
## 39 394 Fold05.Rep1
## 40 437 Fold05.Rep1
## 41 398 Fold05.Rep1
## 42 415 Fold05.Rep1
## 43 445 Fold05.Rep1
## 44 405 Fold05.Rep1
## 45 432 Fold05.Rep1
## 46 421 Fold06.Rep1
## 47 333 Fold06.Rep1
## 48 409 Fold06.Rep1
## 49 457 Fold06.Rep1
## 50 409 Fold06.Rep1
## 51 432 Fold06.Rep1
## 52 470 Fold06.Rep1
## 53 421 Fold06.Rep1
## 54 457 Fold06.Rep1
## 55 404 Fold07.Rep1
## 56 342 Fold07.Rep1
## 57 394 Fold07.Rep1
## 58 445 Fold07.Rep1
## 59 409 Fold07.Rep1
## 60 427 Fold07.Rep1
## 61 455 Fold07.Rep1
## 62 411 Fold07.Rep1
## 63 441 Fold07.Rep1
## 64 406 Fold08.Rep1
## 65 319 Fold08.Rep1
## 66 377 Fold08.Rep1
## 67 438 Fold08.Rep1
## 68 390 Fold08.Rep1
## 69 420 Fold08.Rep1
## 70 456 Fold08.Rep1
## 71 402 Fold08.Rep1
## 72 447 Fold08.Rep1
## 73 409 Fold09.Rep1
## 74 331 Fold09.Rep1
## 75 396 Fold09.Rep1
## 76 455 Fold09.Rep1
## 77 403 Fold09.Rep1
## 78 433 Fold09.Rep1
## 79 474 Fold09.Rep1
## 80 419 Fold09.Rep1
## 81 452 Fold09.Rep1
## 82 421 Fold10.Rep1
## 83 339 Fold10.Rep1
## 84 410 Fold10.Rep1
## 85 459 Fold10.Rep1
## 86 412 Fold10.Rep1
## 87 433 Fold10.Rep1
## 88 478 Fold10.Rep1
## 89 416 Fold10.Rep1
## 90 460 Fold10.Rep1
## 91 401 Fold01.Rep2
## 92 311 Fold01.Rep2
## 93 375 Fold01.Rep2
## 94 436 Fold01.Rep2
## 95 389 Fold01.Rep2
## 96 416 Fold01.Rep2
## 97 446 Fold01.Rep2
## 98 400 Fold01.Rep2
## 99 434 Fold01.Rep2
## 100 407 Fold02.Rep2
## 101 334 Fold02.Rep2
## 102 394 Fold02.Rep2
## 103 451 Fold02.Rep2
## 104 401 Fold02.Rep2
## 105 420 Fold02.Rep2
## 106 458 Fold02.Rep2
## 107 408 Fold02.Rep2
## 108 444 Fold02.Rep2
## 109 415 Fold03.Rep2
## 110 343 Fold03.Rep2
## 111 405 Fold03.Rep2
## 112 440 Fold03.Rep2
## 113 411 Fold03.Rep2
## 114 435 Fold03.Rep2
## 115 455 Fold03.Rep2
## 116 417 Fold03.Rep2
## 117 447 Fold03.Rep2
## 118 415 Fold04.Rep2
## 119 342 Fold04.Rep2
## 120 405 Fold04.Rep2
## 121 453 Fold04.Rep2
## 122 408 Fold04.Rep2
## 123 432 Fold04.Rep2
## 124 462 Fold04.Rep2
## 125 411 Fold04.Rep2
## 126 453 Fold04.Rep2
## 127 403 Fold05.Rep2
## 128 328 Fold05.Rep2
## 129 395 Fold05.Rep2
## 130 445 Fold05.Rep2
## 131 397 Fold05.Rep2
## 132 417 Fold05.Rep2
## 133 453 Fold05.Rep2
## 134 407 Fold05.Rep2
## 135 442 Fold05.Rep2
## 136 425 Fold06.Rep2
## 137 341 Fold06.Rep2
## 138 413 Fold06.Rep2
## 139 454 Fold06.Rep2
## 140 415 Fold06.Rep2
## 141 436 Fold06.Rep2
## 142 465 Fold06.Rep2
## 143 425 Fold06.Rep2
## 144 455 Fold06.Rep2
## 145 403 Fold07.Rep2
## 146 341 Fold07.Rep2
## 147 390 Fold07.Rep2
## 148 440 Fold07.Rep2
## 149 404 Fold07.Rep2
## 150 423 Fold07.Rep2
## 151 456 Fold07.Rep2
## 152 412 Fold07.Rep2
## 153 441 Fold07.Rep2
## 154 424 Fold08.Rep2
## 155 323 Fold08.Rep2
## 156 408 Fold08.Rep2
## 157 455 Fold08.Rep2
## 158 413 Fold08.Rep2
## 159 441 Fold08.Rep2
## 160 477 Fold08.Rep2
## 161 428 Fold08.Rep2
## 162 459 Fold08.Rep2
## 163 408 Fold09.Rep2
## 164 339 Fold09.Rep2
## 165 398 Fold09.Rep2
## 166 454 Fold09.Rep2
## 167 398 Fold09.Rep2
## 168 428 Fold09.Rep2
## 169 472 Fold09.Rep2
## 170 407 Fold09.Rep2
## 171 451 Fold09.Rep2
## 172 413 Fold10.Rep2
## 173 312 Fold10.Rep2
## 174 397 Fold10.Rep2
## 175 446 Fold10.Rep2
## 176 404 Fold10.Rep2
## 177 427 Fold10.Rep2
## 178 459 Fold10.Rep2
## 179 409 Fold10.Rep2
## 180 453 Fold10.Rep2
## 181 432 Fold01.Rep3
## 182 384 Fold01.Rep3
## 183 420 Fold01.Rep3
## 184 457 Fold01.Rep3
## 185 427 Fold01.Rep3
## 186 446 Fold01.Rep3
## 187 470 Fold01.Rep3
## 188 440 Fold01.Rep3
## 189 460 Fold01.Rep3
## 190 432 Fold02.Rep3
## 191 341 Fold02.Rep3
## 192 413 Fold02.Rep3
## 193 460 Fold02.Rep3
## 194 423 Fold02.Rep3
## 195 448 Fold02.Rep3
## 196 471 Fold02.Rep3
## 197 432 Fold02.Rep3
## 198 465 Fold02.Rep3
## 199 432 Fold03.Rep3
## 200 330 Fold03.Rep3
## 201 412 Fold03.Rep3
## 202 465 Fold03.Rep3
## 203 413 Fold03.Rep3
## 204 444 Fold03.Rep3
## 205 481 Fold03.Rep3
## 206 427 Fold03.Rep3
## 207 464 Fold03.Rep3
## 208 402 Fold04.Rep3
## 209 315 Fold04.Rep3
## 210 385 Fold04.Rep3
## 211 441 Fold04.Rep3
## 212 383 Fold04.Rep3
## 213 427 Fold04.Rep3
## 214 453 Fold04.Rep3
## 215 402 Fold04.Rep3
## 216 443 Fold04.Rep3
## 217 412 Fold05.Rep3
## 218 335 Fold05.Rep3
## 219 405 Fold05.Rep3
## 220 455 Fold05.Rep3
## 221 412 Fold05.Rep3
## 222 433 Fold05.Rep3
## 223 470 Fold05.Rep3
## 224 417 Fold05.Rep3
## 225 453 Fold05.Rep3
## 226 376 Fold06.Rep3
## 227 299 Fold06.Rep3
## 228 359 Fold06.Rep3
## 229 415 Fold06.Rep3
## 230 364 Fold06.Rep3
## 231 393 Fold06.Rep3
## 232 434 Fold06.Rep3
## 233 377 Fold06.Rep3
## 234 416 Fold06.Rep3
## 235 379 Fold07.Rep3
## 236 304 Fold07.Rep3
## 237 372 Fold07.Rep3
## 238 417 Fold07.Rep3
## 239 379 Fold07.Rep3
## 240 403 Fold07.Rep3
## 241 435 Fold07.Rep3
## 242 385 Fold07.Rep3
## 243 418 Fold07.Rep3
## 244 417 Fold08.Rep3
## 245 337 Fold08.Rep3
## 246 402 Fold08.Rep3
## 247 441 Fold08.Rep3
## 248 407 Fold08.Rep3
## 249 429 Fold08.Rep3
## 250 456 Fold08.Rep3
## 251 414 Fold08.Rep3
## 252 442 Fold08.Rep3
## 253 414 Fold09.Rep3
## 254 369 Fold09.Rep3
## 255 404 Fold09.Rep3
## 256 443 Fold09.Rep3
## 257 410 Fold09.Rep3
## 258 427 Fold09.Rep3
## 259 463 Fold09.Rep3
## 260 417 Fold09.Rep3
## 261 448 Fold09.Rep3
## 262 424 Fold10.Rep3
## 263 350 Fold10.Rep3
## 264 408 Fold10.Rep3
## 265 463 Fold10.Rep3
## 266 409 Fold10.Rep3
## 267 435 Fold10.Rep3
## 268 489 Fold10.Rep3
## 269 414 Fold10.Rep3
## 270 468 Fold10.Rep3
## 271 409 Fold01.Rep4
## 272 333 Fold01.Rep4
## 273 405 Fold01.Rep4
## 274 454 Fold01.Rep4
## 275 417 Fold01.Rep4
## 276 430 Fold01.Rep4
## 277 462 Fold01.Rep4
## 278 419 Fold01.Rep4
## 279 450 Fold01.Rep4
## 280 392 Fold02.Rep4
## 281 289 Fold02.Rep4
## 282 378 Fold02.Rep4
## 283 433 Fold02.Rep4
## 284 383 Fold02.Rep4
## 285 407 Fold02.Rep4
## 286 449 Fold02.Rep4
## 287 395 Fold02.Rep4
## 288 433 Fold02.Rep4
## 289 420 Fold03.Rep4
## 290 338 Fold03.Rep4
## 291 414 Fold03.Rep4
## 292 460 Fold03.Rep4
## 293 414 Fold03.Rep4
## 294 441 Fold03.Rep4
## 295 474 Fold03.Rep4
## 296 430 Fold03.Rep4
## 297 464 Fold03.Rep4
## 298 410 Fold04.Rep4
## 299 354 Fold04.Rep4
## 300 386 Fold04.Rep4
## 301 443 Fold04.Rep4
## 302 400 Fold04.Rep4
## 303 428 Fold04.Rep4
## 304 464 Fold04.Rep4
## 305 408 Fold04.Rep4
## 306 450 Fold04.Rep4
## 307 414 Fold05.Rep4
## 308 327 Fold05.Rep4
## 309 395 Fold05.Rep4
## 310 445 Fold05.Rep4
## 311 403 Fold05.Rep4
## 312 427 Fold05.Rep4
## 313 456 Fold05.Rep4
## 314 414 Fold05.Rep4
## 315 445 Fold05.Rep4
## 316 432 Fold06.Rep4
## 317 339 Fold06.Rep4
## 318 412 Fold06.Rep4
## 319 465 Fold06.Rep4
## 320 417 Fold06.Rep4
## 321 452 Fold06.Rep4
## 322 468 Fold06.Rep4
## 323 427 Fold06.Rep4
## 324 463 Fold06.Rep4
## 325 423 Fold07.Rep4
## 326 346 Fold07.Rep4
## 327 412 Fold07.Rep4
## 328 447 Fold07.Rep4
## 329 417 Fold07.Rep4
## 330 439 Fold07.Rep4
## 331 461 Fold07.Rep4
## 332 428 Fold07.Rep4
## 333 451 Fold07.Rep4
## 334 396 Fold08.Rep4
## 335 303 Fold08.Rep4
## 336 380 Fold08.Rep4
## 337 443 Fold08.Rep4
## 338 390 Fold08.Rep4
## 339 409 Fold08.Rep4
## 340 452 Fold08.Rep4
## 341 400 Fold08.Rep4
## 342 438 Fold08.Rep4
## 343 420 Fold09.Rep4
## 344 346 Fold09.Rep4
## 345 410 Fold09.Rep4
## 346 441 Fold09.Rep4
## 347 409 Fold09.Rep4
## 348 423 Fold09.Rep4
## 349 457 Fold09.Rep4
## 350 416 Fold09.Rep4
## 351 441 Fold09.Rep4
## 352 401 Fold10.Rep4
## 353 332 Fold10.Rep4
## 354 387 Fold10.Rep4
## 355 438 Fold10.Rep4
## 356 391 Fold10.Rep4
## 357 418 Fold10.Rep4
## 358 462 Fold10.Rep4
## 359 401 Fold10.Rep4
## 360 440 Fold10.Rep4
## 361 414 Fold01.Rep5
## 362 333 Fold01.Rep5
## 363 400 Fold01.Rep5
## 364 438 Fold01.Rep5
## 365 402 Fold01.Rep5
## 366 421 Fold01.Rep5
## 367 465 Fold01.Rep5
## 368 409 Fold01.Rep5
## 369 439 Fold01.Rep5
## 370 425 Fold02.Rep5
## 371 344 Fold02.Rep5
## 372 418 Fold02.Rep5
## 373 459 Fold02.Rep5
## 374 430 Fold02.Rep5
## 375 448 Fold02.Rep5
## 376 480 Fold02.Rep5
## 377 439 Fold02.Rep5
## 378 464 Fold02.Rep5
## 379 404 Fold03.Rep5
## 380 319 Fold03.Rep5
## 381 389 Fold03.Rep5
## 382 433 Fold03.Rep5
## 383 390 Fold03.Rep5
## 384 415 Fold03.Rep5
## 385 458 Fold03.Rep5
## 386 407 Fold03.Rep5
## 387 445 Fold03.Rep5
## 388 419 Fold04.Rep5
## 389 330 Fold04.Rep5
## 390 405 Fold04.Rep5
## 391 448 Fold04.Rep5
## 392 408 Fold04.Rep5
## 393 433 Fold04.Rep5
## 394 468 Fold04.Rep5
## 395 420 Fold04.Rep5
## 396 452 Fold04.Rep5
## 397 393 Fold05.Rep5
## 398 321 Fold05.Rep5
## 399 378 Fold05.Rep5
## 400 420 Fold05.Rep5
## 401 390 Fold05.Rep5
## 402 410 Fold05.Rep5
## 403 440 Fold05.Rep5
## 404 396 Fold05.Rep5
## 405 424 Fold05.Rep5
## 406 386 Fold06.Rep5
## 407 302 Fold06.Rep5
## 408 372 Fold06.Rep5
## 409 425 Fold06.Rep5
## 410 382 Fold06.Rep5
## 411 400 Fold06.Rep5
## 412 442 Fold06.Rep5
## 413 384 Fold06.Rep5
## 414 431 Fold06.Rep5
## 415 432 Fold07.Rep5
## 416 350 Fold07.Rep5
## 417 425 Fold07.Rep5
## 418 463 Fold07.Rep5
## 419 426 Fold07.Rep5
## 420 443 Fold07.Rep5
## 421 480 Fold07.Rep5
## 422 432 Fold07.Rep5
## 423 464 Fold07.Rep5
## 424 407 Fold08.Rep5
## 425 328 Fold08.Rep5
## 426 385 Fold08.Rep5
## 427 444 Fold08.Rep5
## 428 397 Fold08.Rep5
## 429 431 Fold08.Rep5
## 430 470 Fold08.Rep5
## 431 410 Fold08.Rep5
## 432 451 Fold08.Rep5
## 433 405 Fold09.Rep5
## 434 330 Fold09.Rep5
## 435 388 Fold09.Rep5
## 436 443 Fold09.Rep5
## 437 397 Fold09.Rep5
## 438 421 Fold09.Rep5
## 439 458 Fold09.Rep5
## 440 406 Fold09.Rep5
## 441 444 Fold09.Rep5
## 442 439 Fold10.Rep5
## 443 356 Fold10.Rep5
## 444 427 Fold10.Rep5
## 445 472 Fold10.Rep5
## 446 425 Fold10.Rep5
## 447 457 Fold10.Rep5
## 448 483 Fold10.Rep5
## 449 439 Fold10.Rep5
## 450 474 Fold10.Rep5
# Names of the performance metrics caret tracked for the boosting fit
# (output below shows Accuracy and Kappa).
boostingtrain$perfNames
## [1] "Accuracy" "Kappa"
#Optimal model
# boostingoptimal <- gbm(income ~., data = combined[1:32402, ], distribution = "bernoulli", n.trees = 150,
# interaction.depth = 3, shrinkage = 0.1)
# summary(boostingoptimal)
# varImp(boostingoptimal, numTrees = 150)
#Test error of the optimal model
# testerroroptimal <- c()
# thresh <- 0.5
# for(i in 1:15){
# yhat <- predict(boostingtrain, newdata = combined[32403:48598, -44], n.trees = (10 * i), type = "prob")
# yhat <- (yhat > thresh)
# testerroroptimal[i] <- mean(yhat != combined[32403:48598, 44])
# }
# plot(testerroroptimal)
#ROC curve - testing
set.seed(100)
# Test-set class probabilities from the caret-trained boosting model; column 2
# of the "prob" matrix is P(income > 50K). The original `posopt <- c()` was a
# dead assignment (immediately overwritten), so it is dropped.
posopt <- predict(boostingtrain, newdata = combined[32403:48598, -44], n.trees = 150, type = "prob")
predictsopt <- prediction(posopt[, 2], combined[32403:48598, 44])
rocopt <- ROCR::performance(predictsopt, measure = "tpr", x.measure = "fpr")
# ROC curve with the chance diagonal for reference.
plot(rocopt)
abline(0, 1, col = "red")
# Area under the testing ROC curve.
aucopt <- ROCR::performance(predictsopt, measure = "auc")
aucopt@y.values
## [[1]]
## [1] 0.9177315
#ROC and AUC combined testing
# Overlay all testing-set ROC curves on a single set of axes. Using
# `plot(..., add = TRUE)` (the idiom already used later in this file) keeps one
# coordinate system, instead of stacking whole plots with par(new = TRUE),
# which redraws the axes on every call. The title now goes on the first plot.
plot(roc1, type = "l", col = "red",
     main = "1: red, 2: green, 3: blue, 4: black, 5: yellow, trained: purple")
plot(roc2, type = "l", col = "green", add = TRUE)
plot(roc3, type = "l", col = "blue", add = TRUE)
plot(roc4, type = "l", col = "black", add = TRUE)
plot(roc5, type = "l", col = "yellow", add = TRUE)
plot(rocopt, type = "l", col = "purple", add = TRUE)
#Train error of the optimal model
# trainerroropt <- c()
# thresh <- 0.5
# for(i in 1:500){
# yhat <- predict(boostingoptimal, newdata = combined[1:32402, -44], n.trees = (10 * i), type = "response")
# yhat <- (yhat > thresh)
# trainerroropt[i] <- mean(yhat != combined[1:32402, 44])
# }
# plot(trainerroropt)
#ROC curve - training
# Training-set probabilities from the same caret boosting model; the dead
# `pos5opt <- c()` initializer (immediately overwritten) has been removed.
pos5opt <- predict(boostingtrain, newdata = combined[1:32402, -44], n.trees = 150, type = "prob")
predicts5opt <- prediction(pos5opt[, 2], combined[1:32402, 44])
roc5opt <- ROCR::performance(predicts5opt, measure = "tpr", x.measure = "fpr")
plot(roc5opt)
abline(0, 1, col = "red")
# Training AUC, for comparison with the testing AUC above.
auc5opt <- ROCR::performance(predicts5opt, measure = "auc")
auc5opt@y.values
## [[1]]
## [1] 0.9215941
# boosting <- C50::C5.0(newtrain2[, -45], newtrain2[, 45], trials = 10) #boosting iteration = 10
# summary(boosting)
#
# classes <- predict(boosting, newtest2[, -45], type = "class")
# table(classes, newtest2[, 45])
#
# acc <- sum(classes == newtest2[, 45]) / length(newtest2[, 45])
# acc
# https://github.com/topepo/caret/blob/master/RegressionTests/Code/C5.0.R
#
# cctrl1 <- trainControl(method = "cv", number = 3, returnResamp = "all",
# classProbs = TRUE,
# summaryFunction = twoClassSummary)
# cctrl2 <- trainControl(method = "LOOCV",
# classProbs = TRUE, summaryFunction = twoClassSummary)
# cctrl3 <- trainControl(method = "none",
# classProbs = TRUE, summaryFunction = twoClassSummary)
# cctrlR <- trainControl(method = "cv", number = 3, returnResamp = "all",
# classProbs = TRUE,
# search = "random")
#
# y <- as.numeric(newtrain2$income) - 1
# test_class_cv_model <- train(newtrain2[, -45], y,
# method = "C5.0",
# trControl = cctrl1,
# metric = "ROC",
# control = C50::C5.0Control(seed = 1),
# preProc = c("center", "scale"))
\(\\\)
\(\\\)
We commented this chunk out, since it takes extremely long to run…
\(\\\)
\(\\\)
# Compare test-set accuracy of the five hand-tuned gbm fits (boosting1-5) and
# the caret-trained model (boostingtrain) on rows 32403:48598 of `combined`.
# The 0/1 target in column 44 is compared directly against logical
# predictions, relying on TRUE/FALSE coercing to 1/0.
set.seed(100)
thresh <- 0.5
a <- predict(boosting1, newdata = combined[32403:48598, -44], n.trees = 5000, type = "response")
a1 <- (a > thresh)
a2 <- mean(a1 == combined[32403:48598, 44])
b <- predict(boosting2, newdata = combined[32403:48598, -44], n.trees = 2000, type = "response")
# NOTE(review): boosting2 uses a 0.3 cutoff rather than `thresh` — presumably
# a tuned threshold for this model; confirm it is intentional.
b1 <- (b > 0.3)
b2 <- mean(b1 == combined[32403:48598, 44])
# NOTE(review): `c` shadows base::c() from here on in the global environment;
# consider renaming if base::c is needed later in this script.
c <- predict(boosting3, newdata = combined[32403:48598, -44], n.trees = 5000, type = "response")
c1 <- (c > thresh)
c2 <- mean(c1 == combined[32403:48598, 44])
d <- predict(boosting4, newdata = combined[32403:48598, -44], n.trees = 200, type = "response")
d1 <- (d > thresh)
d2 <- mean(d1 == combined[32403:48598, 44])
e <- predict(boosting5, newdata = combined[32403:48598, -44], n.trees = 800, type = "response")
e1 <- (e > thresh)
e2 <- mean(e1 == combined[32403:48598, 44])
# boostingtrain is a caret fit: type = "raw" returns factor class labels,
# which are mapped to 0/1 via as.numeric(f) - 1 before scoring.
f <- predict(boostingtrain, newdata = combined[32403:48598, -44], n.trees = 150, type = "raw")
f1 <- as.numeric(f) - 1
f2 <- mean(f1 == combined[32403:48598, 44])
# Accuracy of each model, in order boosting1..5 then the caret model.
a2
## [1] 0.8631761
b2
## [1] 0.835021
c2
## [1] 0.857187
d2
## [1] 0.8700296
e2
## [1] 0.8710175
f2
## [1] 0.8670042
\(\\\)
\(\\\)
# Keep the selected boosted models under the names used by the comparison
# section below.
# NOTE(review): final.auc4 is set to boosting5 (the model with the highest
# test accuracy above), while final.thres4 keeps boosting4 — confirm this
# pairing is intentional given the "same model" remark that follows.
final.auc4 <- boosting5
final.thres4 <- boosting4
Comment:
The model with the highest AUC and the highest testing set accuracy rate are the same!
set.seed(100)
# Load the cleaned train/test splits produced earlier in the pipeline.
# (header = TRUE spelled out; T is a reassignable alias and should be avoided.)
newtrain2 <- read.csv("../data/cleandata/newtrain2.csv", header = TRUE)
newtest2 <- read.csv("../data/cleandata/newtest2.csv", header = TRUE)
#Change to binary digit
# Stack train on top of test so both share one coding of the factor levels,
# then map the income factor to numeric 0/1 (gbm's bernoulli loss expects a
# 0/1 response). Rows 1:32402 are train, 32403:48598 are test.
combined <- rbind(newtrain2, newtest2)
combined$income <- as.numeric(combined$income) - 1
We picked the best classifiers from the classification tree, bagged tree, and random forest by using AUC and the accuracy rate at the best threshold on the training dataset.
set.seed(100)
#from classification
# Print the final classification tree chosen by AUC (rpart-style tree output
# follows below).
final.auc1
## n= 32402
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 32402 7682 <=50K (0.762915869 0.237084131)
## 2) Married.civ.spouse< 0.5 17558 1122 <=50K (0.936097505 0.063902495)
## 4) capital.gain< 7073.5 17274 849 <=50K (0.950850990 0.049149010)
## 8) education.num< 12.5 13864 342 <=50K (0.975331795 0.024668205)
## 16) capital.loss< 2218.5 13807 315 <=50K (0.977185486 0.022814514) *
## 17) capital.loss>=2218.5 57 27 <=50K (0.526315789 0.473684211)
## 34) capital.loss>=3343.5 8 0 <=50K (1.000000000 0.000000000) *
## 35) capital.loss< 3343.5 49 22 >50K (0.448979592 0.551020408) *
## 9) education.num>=12.5 3410 507 <=50K (0.851319648 0.148680352) *
## 5) capital.gain>=7073.5 284 11 >50K (0.038732394 0.961267606)
## 10) capital.gain>=30961.5 5 0 <=50K (1.000000000 0.000000000) *
## 11) capital.gain< 30961.5 279 6 >50K (0.021505376 0.978494624)
## 22) capital.gain< 8296 19 6 >50K (0.315789474 0.684210526)
## 44) education.num< 11.5 8 2 <=50K (0.750000000 0.250000000) *
## 45) education.num>=11.5 11 0 >50K (0.000000000 1.000000000) *
## 23) capital.gain>=8296 260 0 >50K (0.000000000 1.000000000) *
## 3) Married.civ.spouse>=0.5 14844 6560 <=50K (0.558070601 0.441929399)
## 6) education.num< 12.5 10475 3446 <=50K (0.671026253 0.328973747)
## 12) capital.gain< 5095.5 9979 2961 <=50K (0.703276881 0.296723119)
## 24) education.num< 8.5 1656 167 <=50K (0.899154589 0.100845411) *
## 25) education.num>=8.5 8323 2794 <=50K (0.664303737 0.335696263)
## 50) capital.loss< 1782.5 7988 2542 <=50K (0.681772659 0.318227341) *
## 51) capital.loss>=1782.5 335 83 >50K (0.247761194 0.752238806) *
## 13) capital.gain>=5095.5 496 11 >50K (0.022177419 0.977822581)
## 26) capital.gain>=21045.5 2 0 <=50K (1.000000000 0.000000000) *
## 27) capital.gain< 21045.5 494 9 >50K (0.018218623 0.981781377) *
## 7) education.num>=12.5 4369 1255 >50K (0.287251087 0.712748913)
## 14) capital.gain< 5095.5 3788 1252 >50K (0.330517423 0.669482577)
## 28) capital.loss< 1782.5 3390 1239 >50K (0.365486726 0.634513274)
## 56) hours.per.week< 31 306 112 <=50K (0.633986928 0.366013072) *
## 57) hours.per.week>=31 3084 1045 >50K (0.338845655 0.661154345) *
## 29) capital.loss>=1782.5 398 13 >50K (0.032663317 0.967336683) *
## 15) capital.gain>=5095.5 581 3 >50K (0.005163511 0.994836489) *
#Getting predicted >50K of income probabilities
# Column 2 of the probability matrix is P(income > 50K) on the test set.
tree_prob <- predict(final.auc1, newdata = newtest2, type = "prob")[, 2]
tree_prediction <- prediction(tree_prob, newtest2$income)
tree_performance <- ROCR::performance(tree_prediction,
                                      measure = "tpr", x.measure = "fpr")
#Plot ROC curve
# ROC curve for the classification tree, with the chance diagonal.
plot(tree_performance, main = "ROC curve")
abline(0, 1, lty = 2)
#Calculate AUC
tree.auc <-
  ROCR::performance(tree_prediction, measure = "auc")@y.values[[1]]
tree.auc
## [1] 0.8768653
#==============================================================
#from bagged tree
# Print the bagged-tree fit (a randomForest with mtry = 43, i.e. all
# predictors tried at each split — see the Call below).
final.auc2
##
## Call:
## randomForest(formula = f, data = data, classwt = classwt, cutoff = cutoff, ntree = 68L, mtry = 43, importance = TRUE, nodesize = 43L)
## Type of random forest: classification
## Number of trees: 68
## No. of variables tried at each split: 43
##
## OOB estimate of error rate: 13.66%
## Confusion matrix:
## <=50K >50K class.error
## <=50K 22960 1760 0.07119741
## >50K 2666 5016 0.34704504
#Getting predicted >50K of income probabilities
# P(income > 50K) on the test set from the bagged-tree model.
tunned.bag.rf_prob <- predict(final.auc2, newdata = newtest2, type = "prob")[, 2]
tunned.bag.rf_prediction <- prediction(tunned.bag.rf_prob, newtest2$income)
tunned.bag.rf_performance <- ROCR::performance(tunned.bag.rf_prediction,
                                               measure = "tpr",
                                               x.measure = "fpr")
#Plot ROC curve
plot(tunned.bag.rf_performance, main = "ROC curve")
abline(0, 1, lty = 2)
#Calculate AUC
tunned.bag.rf.auc <-
  ROCR::performance(tunned.bag.rf_prediction, measure = "auc")@y.values[[1]]
tunned.bag.rf.auc
## [1] 0.8942506
#==============================================================
#from random forest
# Print the tuned random forest (79 trees, mtry = 8 — see the Call below).
final.auc3
##
## Call:
## randomForest(formula = f, data = data, classwt = classwt, cutoff = cutoff, ntree = 79L, importance = TRUE, mtry = 8L, nodesize = 14L)
## Type of random forest: classification
## Number of trees: 79
## No. of variables tried at each split: 8
##
## OOB estimate of error rate: 13.47%
## Confusion matrix:
## <=50K >50K class.error
## <=50K 23063 1657 0.06703074
## >50K 2709 4973 0.35264254
#Getting predicted >50K of income probabilities
# P(income > 50K) on the test set from the tuned random forest.
tunned.rf_prob <- predict(final.auc3, newdata = newtest2, type = "prob")[, 2]
tunned.rf_prediction <- prediction(tunned.rf_prob, newtest2$income)
tunned.rf_performance <- ROCR::performance(tunned.rf_prediction,
                                           measure = "tpr", x.measure = "fpr")
#Plot ROC curve
plot(tunned.rf_performance, main = "ROC curve")
abline(0, 1, lty = 2)
#Calculate AUC
tunned.rf.auc <-
  ROCR::performance(tunned.rf_prediction, measure = "auc")@y.values[[1]]
tunned.rf.auc
## [1] 0.8962369
#==============================================================
#from boosted
# Print the boosted model (gbm, bernoulli loss, 5000 iterations — see below).
final.auc4
## gbm(formula = income ~ ., distribution = "bernoulli", data = combined[1:32402,
## ], n.trees = 5000, interaction.depth = 3, shrinkage = 0.1)
## A gradient boosted model with bernoulli loss function.
## 5000 iterations were performed.
## There were 43 predictors of which 42 had non-zero influence.
#ROC curve - testing
# Boosted-model test-set probabilities: with bernoulli loss, type = "response"
# returns P(income > 50K) directly. The dead `pos5 <- c()` initializer
# (immediately overwritten) has been removed.
pos5 <- predict(final.auc4, newdata = combined[32403:48598, -44], n.trees = 800, type = "response")
predicts5 <- prediction(pos5, combined[32403:48598, 44])
roc5 <- ROCR::performance(predicts5, measure = "tpr", x.measure = "fpr")
plot(roc5, main = "ROC curve")
abline(0, 1, col = "red")
# Test-set AUC for the boosted model.
auc5 <- ROCR::performance(predicts5, measure = "auc")
auc5@y.values
## [[1]]
## [1] 0.9231948
\(\\\)
\(\\\)
set.seed(100)
# Hard class labels from the classification tree, scored against the test
# labels with caret's full confusion-matrix summary.
tree_class <- predict(final.auc1, newdata = newtest2, type = "class")
confusionMatrix(data = tree_class, reference = newtest2$income)
## Confusion Matrix and Statistics
##
## Reference
## Prediction <=50K >50K
## <=50K 11859 1825
## >50K 576 1936
##
## Accuracy : 0.8518
## 95% CI : (0.8462, 0.8572)
## No Information Rate : 0.7678
## P-Value [Acc > NIR] : < 0.00000000000000022
##
## Kappa : 0.5298
## Mcnemar's Test P-Value : < 0.00000000000000022
##
## Sensitivity : 0.9537
## Specificity : 0.5148
## Pos Pred Value : 0.8666
## Neg Pred Value : 0.7707
## Prevalence : 0.7678
## Detection Rate : 0.7322
## Detection Prevalence : 0.8449
## Balanced Accuracy : 0.7342
##
## 'Positive' Class : <=50K
##
#==============================================================
tunned.bag.rf_class <- predict(final.auc2, newdata = newtest2,
type = "class")
confusionMatrix(tunned.bag.rf_class, newtest2$income)
## Confusion Matrix and Statistics
##
## Reference
## Prediction <=50K >50K
## <=50K 11504 1340
## >50K 931 2421
##
## Accuracy : 0.8598
## 95% CI : (0.8543, 0.8651)
## No Information Rate : 0.7678
## P-Value [Acc > NIR] : < 0.00000000000000022
##
## Kappa : 0.5913
## Mcnemar's Test P-Value : < 0.00000000000000022
##
## Sensitivity : 0.9251
## Specificity : 0.6437
## Pos Pred Value : 0.8957
## Neg Pred Value : 0.7223
## Prevalence : 0.7678
## Detection Rate : 0.7103
## Detection Prevalence : 0.7930
## Balanced Accuracy : 0.7844
##
## 'Positive' Class : <=50K
##
#==============================================================
tunned.rf_class <- predict(final.auc3, newdata = newtest2,
type = "class")
confusionMatrix(tunned.rf_class, newtest2$income)
## Confusion Matrix and Statistics
##
## Reference
## Prediction <=50K >50K
## <=50K 11601 1362
## >50K 834 2399
##
## Accuracy : 0.8644
## 95% CI : (0.859, 0.8696)
## No Information Rate : 0.7678
## P-Value [Acc > NIR] : < 0.00000000000000022
##
## Kappa : 0.6002
## Mcnemar's Test P-Value : < 0.00000000000000022
##
## Sensitivity : 0.9329
## Specificity : 0.6379
## Pos Pred Value : 0.8949
## Neg Pred Value : 0.7420
## Prevalence : 0.7678
## Detection Rate : 0.7163
## Detection Prevalence : 0.8004
## Balanced Accuracy : 0.7854
##
## 'Positive' Class : <=50K
##
#==============================================================
# Boosted model: gbm's bernoulli response is P(income > 50K); threshold at 0.5
# and map to the class labels used by the other models. Call the generic
# predict() so S3 dispatch selects predict.gbm, rather than calling the
# method directly.
boosted_class <- predict(final.auc4,
                         newdata = combined[32403:48598, -44],
                         n.trees = 800, type = "response")
boosted_class <- ifelse(boosted_class > 0.5, ">50K", "<=50K")
# confusionMatrix in current caret requires factors, not character vectors;
# as.factor is a no-op if income is already a factor, and sorts the labels
# into the same "<=50K" < ">50K" order.
confusionMatrix(data = as.factor(boosted_class),
                reference = as.factor(newtest2$income))
## Confusion Matrix and Statistics
##
## Reference
## Prediction <=50K >50K
## <=50K 11703 1357
## >50K 732 2404
##
## Accuracy : 0.871
## 95% CI : (0.8658, 0.8761)
## No Information Rate : 0.7678
## P-Value [Acc > NIR] : < 0.00000000000000022
##
## Kappa : 0.616
## Mcnemar's Test P-Value : < 0.00000000000000022
##
## Sensitivity : 0.9411
## Specificity : 0.6392
## Pos Pred Value : 0.8961
## Neg Pred Value : 0.7666
## Prevalence : 0.7678
## Detection Rate : 0.7226
## Detection Prevalence : 0.8064
## Balanced Accuracy : 0.7902
##
## 'Positive' Class : <=50K
##
\(\\\)
\(\\\)
set.seed(100)
#Plot ROC curve
# Overlay the four final-model ROC curves; colors match the legend below.
# (add = TRUE spelled out instead of the reassignable alias T.)
plot(tree_performance, main = "ROC curve", col = "blue")      # classification
plot(tunned.bag.rf_performance, add = TRUE, col = "red")      # bagged
plot(tunned.rf_performance, add = TRUE, col = "green")        # random forest
plot(roc5, add = TRUE)                                        # boosted (black)
abline(a = 0, b = 1, lty = 2)
legend("bottomright", legend = c("Classification", "Bagged",
                                 "Boosted", "Random Forest"),
       col = c("blue", "red", "black", "green"), lwd = 3, cex = 0.5,
       horiz = TRUE)
# Collect the test-set AUCs and display the columns in ascending order.
AUC.final <- data.frame(tree.auc, tunned.bag.rf.auc, tunned.rf.auc,
                        boosted.auc = auc5@y.values[[1]])
AUC.final[, order(AUC.final)]
## tree.auc tunned.bag.rf.auc tunned.rf.auc boosted.auc
## 1 0.8768653 0.8942506 0.8962369 0.9231948
\(\\\)
\(\\\)
set.seed(100)
#from classification
# Re-print the final classification tree before the TPR-vs-TNR plots below.
final.auc1
## n= 32402
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 32402 7682 <=50K (0.762915869 0.237084131)
## 2) Married.civ.spouse< 0.5 17558 1122 <=50K (0.936097505 0.063902495)
## 4) capital.gain< 7073.5 17274 849 <=50K (0.950850990 0.049149010)
## 8) education.num< 12.5 13864 342 <=50K (0.975331795 0.024668205)
## 16) capital.loss< 2218.5 13807 315 <=50K (0.977185486 0.022814514) *
## 17) capital.loss>=2218.5 57 27 <=50K (0.526315789 0.473684211)
## 34) capital.loss>=3343.5 8 0 <=50K (1.000000000 0.000000000) *
## 35) capital.loss< 3343.5 49 22 >50K (0.448979592 0.551020408) *
## 9) education.num>=12.5 3410 507 <=50K (0.851319648 0.148680352) *
## 5) capital.gain>=7073.5 284 11 >50K (0.038732394 0.961267606)
## 10) capital.gain>=30961.5 5 0 <=50K (1.000000000 0.000000000) *
## 11) capital.gain< 30961.5 279 6 >50K (0.021505376 0.978494624)
## 22) capital.gain< 8296 19 6 >50K (0.315789474 0.684210526)
## 44) education.num< 11.5 8 2 <=50K (0.750000000 0.250000000) *
## 45) education.num>=11.5 11 0 >50K (0.000000000 1.000000000) *
## 23) capital.gain>=8296 260 0 >50K (0.000000000 1.000000000) *
## 3) Married.civ.spouse>=0.5 14844 6560 <=50K (0.558070601 0.441929399)
## 6) education.num< 12.5 10475 3446 <=50K (0.671026253 0.328973747)
## 12) capital.gain< 5095.5 9979 2961 <=50K (0.703276881 0.296723119)
## 24) education.num< 8.5 1656 167 <=50K (0.899154589 0.100845411) *
## 25) education.num>=8.5 8323 2794 <=50K (0.664303737 0.335696263)
## 50) capital.loss< 1782.5 7988 2542 <=50K (0.681772659 0.318227341) *
## 51) capital.loss>=1782.5 335 83 >50K (0.247761194 0.752238806) *
## 13) capital.gain>=5095.5 496 11 >50K (0.022177419 0.977822581)
## 26) capital.gain>=21045.5 2 0 <=50K (1.000000000 0.000000000) *
## 27) capital.gain< 21045.5 494 9 >50K (0.018218623 0.981781377) *
## 7) education.num>=12.5 4369 1255 >50K (0.287251087 0.712748913)
## 14) capital.gain< 5095.5 3788 1252 >50K (0.330517423 0.669482577)
## 28) capital.loss< 1782.5 3390 1239 >50K (0.365486726 0.634513274)
## 56) hours.per.week< 31 306 112 <=50K (0.633986928 0.366013072) *
## 57) hours.per.week>=31 3084 1045 >50K (0.338845655 0.661154345) *
## 29) capital.loss>=1782.5 398 13 >50K (0.032663317 0.967336683) *
## 15) capital.gain>=5095.5 581 3 >50K (0.005163511 0.994836489) *
#Getting predicted >50K of income probabilities
# Column 2 of the probability matrix is P(income == ">50K") per test row.
tree_prob <- predict(final.auc1, newdata = newtest2, type = "prob")[, 2]
# ROCR prediction object pairing the scores with the true labels.
tree_prediction <- prediction(tree_prob, newtest2$income)
# TPR (y) vs TNR (x): a sensitivity/specificity trade-off curve rather than
# the classic ROC (TPR vs FPR); the a = 1, b = -1 reference line matches this.
tree_performance <- ROCR::performance(tree_prediction, measure = "tpr", x.measure = "tnr")
#Plot ROC curve
plot(tree_performance, main = "TPR v.s. TNR")
# Dashed reference line from (0, 1) to (1, 0).
abline(a = 1, b = -1, lty = 2)
#==============================================================
#from bagged tree
# Print the AUC-selected bagged model: a randomForest with mtry = 43, i.e.
# all 43 predictors tried at every split, which makes it bagging.
final.auc2
##
## Call:
## randomForest(formula = f, data = data, classwt = classwt, cutoff = cutoff, ntree = 68L, mtry = 43, importance = TRUE, nodesize = 43L)
## Type of random forest: classification
## Number of trees: 68
## No. of variables tried at each split: 43
##
## OOB estimate of error rate: 13.66%
## Confusion matrix:
## <=50K >50K class.error
## <=50K 22960 1760 0.07119741
## >50K 2666 5016 0.34704504
#Getting predicted >50K of income probabilities
tunned.bag.rf_prob <- predict(final.auc2, newdata = newtest2,
type = "prob")[, 2]
# Pair the scores with the true labels, then compute TPR vs TNR.
tunned.bag.rf_prediction <- prediction(tunned.bag.rf_prob, newtest2$income)
tunned.bag.rf_performance <- ROCR::performance(tunned.bag.rf_prediction,
measure = "tpr",
x.measure = "tnr")
#Plot ROC curve
plot(tunned.bag.rf_performance, main="TPR v.s. TNR")
# Dashed reference line from (0, 1) to (1, 0).
abline(a = 1, b = -1, lty = 2)
#==============================================================
#from random forest
# Print the AUC-selected tuned random forest (79 trees, mtry = 8).
final.auc3
##
## Call:
## randomForest(formula = f, data = data, classwt = classwt, cutoff = cutoff, ntree = 79L, importance = TRUE, mtry = 8L, nodesize = 14L)
## Type of random forest: classification
## Number of trees: 79
## No. of variables tried at each split: 8
##
## OOB estimate of error rate: 13.47%
## Confusion matrix:
## <=50K >50K class.error
## <=50K 23063 1657 0.06703074
## >50K 2709 4973 0.35264254
#Getting predicted >50K of income probabilities
tunned.rf_prob <- predict(final.auc3, newdata = newtest2,
type = "prob")[, 2]
tunned.rf_prediction <- prediction(tunned.rf_prob, newtest2$income)
# TPR (y) vs TNR (x) performance object for the tuned forest.
tunned.rf_performance <- ROCR::performance(tunned.rf_prediction, measure = "tpr", x.measure = "tnr")
#Plot ROC curve
plot(tunned.rf_performance, main = "TPR v.s. TNR")
# Dashed reference line from (0, 1) to (1, 0).
abline(a = 1, b = -1, lty = 2)
#==============================================================
#from boosted
# Print the AUC-selected gbm (bernoulli loss, 5000 trees, depth 3).
final.auc4
## gbm(formula = income ~ ., distribution = "bernoulli", data = combined[1:32402,
## ], n.trees = 5000, interaction.depth = 3, shrinkage = 0.1)
## A gradient boosted model with bernoulli loss function.
## 5000 iterations were performed.
## There were 43 predictors of which 42 had non-zero influence.
#ROC curve - testing
# Predicted P(income = 1) on the held-out rows 32403:48598; column 44 holds
# the label. Removed the dead `pos5 <- c()` initialization -- it was
# overwritten by the very next assignment and served no purpose.
# NOTE(review): n.trees = 800 although the model was fit with 5000 trees --
# presumably an intentional early stop; confirm.
pos5 <- predict(final.auc4, newdata = combined[32403:48598, -44],
                n.trees = 800, type = "response")
predicts5 <- prediction(pos5, combined[32403:48598, 44])
roc5 <- ROCR::performance(predicts5, measure = "tpr", x.measure = "tnr")
plot(roc5, main = "TPR v.s. TNR")
# lty = 2 added for consistency with the dashed reference lines used by the
# other TPR-vs-TNR panels in this report.
abline(a = 1, b = -1, col = "red", lty = 2)
# Overlay the four TPR-vs-TNR curves (AUC-selected models) on one panel.
plot(tree_performance, col = "blue",
     main = "TPR v.s. TNR - AUC selection")
plot(tunned.bag.rf_performance, add = TRUE, col = "red")
plot(tunned.rf_performance, add = TRUE, col = "green")
plot(roc5, add = TRUE)  # boosted curve, drawn in the default black
# Dashed diagonal from (0, 1) to (1, 0) as a reference.
abline(a = 1, b = -1, lty = 2)
# Legend colors follow the draw order above (boosted was drawn in black).
legend("bottomleft",
       legend = c("Classification", "Bagged", "Boosted", "Random Forest"),
       col = c("blue", "red", "black", "green"),
       lwd = 3, cex = 0.5, horiz = TRUE)
# \(\\\) \(\\\)  (R Markdown vertical-space markup; commented out so the
# extracted script remains parseable R)
set.seed(100)
#from classification
#final.thres1
# P(income == ">50K") from the accuracy-selected pruned tree (caret fit).
info_prob <- predict(final.thres1.half, newdata = newtest2, type = "prob")[, 2]
#Test accuracy rate by using default cutoff 0.5
# Accuracy = share of rows where "prob > 0.5" agrees with the true ">50K" flag.
prunned.info.accuracy <- mean((info_prob > 0.5) == (newtest2$income == ">50K"))
cat("Accuracy classification : ", prunned.info.accuracy, "\n")
## Accuracy classification : 0.8608916
#==============================================================
#from bagged tree
#final.thres2 # bag.rforest$learner.model
# NOTE(review): this reuses (clobbers) `tunned.bag.rf_prob` from the earlier
# AUC section; harmless here, but rename it if that value is needed later.
tunned.bag.rf_prob <- predict(final.thres2.half, newdata = newtest2,
type = "prob")[, 2]
#Test accuracy rate by using default cutoff 0.5
tunned.bagged.accuracy <- mean((tunned.bag.rf_prob > 0.5) == (newtest2$income == ">50K"))
cat("Accuracy Bagged : ", tunned.bagged.accuracy, "\n")
## Accuracy Bagged : 0.8620647
#==============================================================
#from random forest
#final.thres3 # untunned.forest$learner.model
# P(income == ">50K") from the accuracy-selected (untuned) random forest.
untunned.rf_prob <- predict(final.thres3.half, newdata = newtest2,
type = "prob")[, 2]
#Test accuracy rate by using default cutoff 0.5
rf.untunned.accuracy <- mean((untunned.rf_prob > 0.5) == (newtest2$income == ">50K"))
cat("Accuracy Random Forest : ", rf.untunned.accuracy, "\n")
## Accuracy Random Forest : 0.866078
#==============================================================
#from boosting
#final.thres4
# gbm response-scale prediction = P(income = 1) on the held-out rows.
e <- predict(final.thres4, newdata = combined[32403:48598, -44], n.trees = 800, type = "response")
# Hard classification at the default 0.5 cutoff.
e1 <- (e > 0.5)
# Accuracy vs column 44 -- assumes the label there is coded 0/1 so the
# logical/numeric comparison is meaningful; TODO confirm upstream coding.
e2 <- mean(e1 == combined[32403:48598, 44])
cat("Accuracy Boosted : ", e2, "\n")
## Accuracy Boosted : 0.8695357
# Comment:
# So our best classifier under the accuracy criterion is the boosted tree.
# \(\\\) \(\\\)  (R Markdown vertical-space markup; commented out so the
# extracted script remains parseable R)
set.seed(100)
# Hard class labels from the pruned tree. predict() is called on the
# underlying rpart fit ($finalModel), presumably because caret's
# predict.train does not accept type = "class" -- confirm.
classification_class2 <- predict(final.thres1.half$finalModel, newdata = newtest2, type = "class")
confusionMatrix(classification_class2, newtest2$income)
## Confusion Matrix and Statistics
##
## Reference
## Prediction <=50K >50K
## <=50K 11767 1585
## >50K 668 2176
##
## Accuracy : 0.8609
## 95% CI : (0.8555, 0.8662)
## No Information Rate : 0.7678
## P-Value [Acc > NIR] : < 0.00000000000000022
##
## Kappa : 0.5736
## Mcnemar's Test P-Value : < 0.00000000000000022
##
## Sensitivity : 0.9463
## Specificity : 0.5786
## Pos Pred Value : 0.8813
## Neg Pred Value : 0.7651
## Prevalence : 0.7678
## Detection Rate : 0.7265
## Detection Prevalence : 0.8244
## Balanced Accuracy : 0.7624
##
## 'Positive' Class : <=50K
##
#==============================================================
# Confusion matrix for the accuracy-selected bagged forest (hard labels).
tunned.bag.rf_class2 <- predict(final.thres2.half, newdata = newtest2,
type = "class")
confusionMatrix(tunned.bag.rf_class2, newtest2$income)
## Confusion Matrix and Statistics
##
## Reference
## Prediction <=50K >50K
## <=50K 11504 1340
## >50K 931 2421
##
## Accuracy : 0.8598
## 95% CI : (0.8543, 0.8651)
## No Information Rate : 0.7678
## P-Value [Acc > NIR] : < 0.00000000000000022
##
## Kappa : 0.5913
## Mcnemar's Test P-Value : < 0.00000000000000022
##
## Sensitivity : 0.9251
## Specificity : 0.6437
## Pos Pred Value : 0.8957
## Neg Pred Value : 0.7223
## Prevalence : 0.7678
## Detection Rate : 0.7103
## Detection Prevalence : 0.7930
## Balanced Accuracy : 0.7844
##
## 'Positive' Class : <=50K
##
#==============================================================
# Confusion matrix for the accuracy-selected random forest (hard labels).
untunned.rf_class2 <- predict(final.thres3.half, newdata = newtest2,
type = "class")
confusionMatrix(untunned.rf_class2, newtest2$income)
## Confusion Matrix and Statistics
##
## Reference
## Prediction <=50K >50K
## <=50K 11598 1359
## >50K 837 2402
##
## Accuracy : 0.8644
## 95% CI : (0.859, 0.8696)
## No Information Rate : 0.7678
## P-Value [Acc > NIR] : < 0.00000000000000022
##
## Kappa : 0.6004
## Mcnemar's Test P-Value : < 0.00000000000000022
##
## Sensitivity : 0.9327
## Specificity : 0.6387
## Pos Pred Value : 0.8951
## Neg Pred Value : 0.7416
## Prevalence : 0.7678
## Detection Rate : 0.7161
## Detection Prevalence : 0.8000
## Balanced Accuracy : 0.7857
##
## 'Positive' Class : <=50K
##
#==============================================================
# Boosted model: response-scale probabilities, then hard labels at 0.5.
boosted_class2 <- predict(final.thres4, newdata = combined[32403:48598, -44], n.trees = 800, type = "response")
boosted_class2 <- ifelse(boosted_class2 > 0.5, ">50K", "<=50K")
# FIX: ifelse() yields a character vector, but caret::confusionMatrix()
# requires factors whose levels match the reference; coerce with the
# reference's level order so the call succeeds (current caret errors on
# character input) and the table rows line up with newtest2$income.
boosted_class2 <- factor(boosted_class2, levels = levels(newtest2$income))
confusionMatrix(boosted_class2, newtest2$income)
## Confusion Matrix and Statistics
##
## Reference
## Prediction <=50K >50K
## <=50K 11667 1345
## >50K 768 2416
##
## Accuracy : 0.8695
## 95% CI : (0.8643, 0.8747)
## No Information Rate : 0.7678
## P-Value [Acc > NIR] : < 0.00000000000000022
##
## Kappa : 0.6134
## Mcnemar's Test P-Value : < 0.00000000000000022
##
## Sensitivity : 0.9382
## Specificity : 0.6424
## Pos Pred Value : 0.8966
## Neg Pred Value : 0.7588
## Prevalence : 0.7678
## Detection Rate : 0.7204
## Detection Prevalence : 0.8034
## Balanced Accuracy : 0.7903
##
## 'Positive' Class : <=50K
##
# \(\\\) \(\\\)  (R Markdown vertical-space markup; commented out so the
# extracted script remains parseable R)
set.seed(100)
#from classification
# Print the accuracy-selected CART model (caret train object; cp chosen by
# repeated 10-fold cross-validation on accuracy).
final.thres1.half
## CART
##
## 32402 samples
## 43 predictor
## 2 classes: '<=50K', '>50K'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 29162, 29161, 29162, 29162, 29162, 29162, ...
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 0.001171570 0.8576940 0.5683314
## 0.001366832 0.8572001 0.5639238
## 0.002212965 0.8547208 0.5507302
## 0.002629524 0.8536818 0.5444061
## 0.003558101 0.8501429 0.5352532
## 0.006769071 0.8442687 0.5163148
## 0.010999740 0.8432194 0.5114075
## 0.034105702 0.8385798 0.4924548
## 0.061702682 0.8264613 0.4422631
## 0.120997136 0.7879256 0.1876654
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.00117157.
#Getting predicted >50K of income probabilities
# Column 2 of the probability matrix is P(income == ">50K").
tree_prob2 <- predict(final.thres1.half, newdata = newtest2,
type = "prob")[, 2]
tree_prediction2 <- prediction(tree_prob2, newtest2$income)
# TPR (y) vs TNR (x) -- sensitivity/specificity trade-off, not a classic ROC.
tree_performance2 <- ROCR::performance(tree_prediction2,
measure = "tpr", x.measure = "tnr")
#Plot ROC curve
plot(tree_performance2, main = "TPR v.s. TNR")
# Dashed reference line from (0, 1) to (1, 0).
abline(a = 1, b = -1, lty = 2)
#==============================================================
#from bagged tree
# Print the accuracy-selected bagged model (randomForest, mtry = 43 = all
# predictors, hence bagging).
final.thres2.half
##
## Call:
## randomForest(formula = f, data = data, classwt = classwt, cutoff = cutoff, ntree = 68L, mtry = 43, importance = TRUE, nodesize = 43L)
## Type of random forest: classification
## Number of trees: 68
## No. of variables tried at each split: 43
##
## OOB estimate of error rate: 13.66%
## Confusion matrix:
## <=50K >50K class.error
## <=50K 22960 1760 0.07119741
## >50K 2666 5016 0.34704504
#Getting predicted >50K of income probabilities
tunned.bag.rf_prob2 <- predict(final.thres2.half, newdata = newtest2,
type = "prob")[, 2]
tunned.bag.rf_prediction2 <- prediction(tunned.bag.rf_prob2, newtest2$income)
# TPR (y) vs TNR (x) for the accuracy-selected bagged model.
tunned.bag.rf_performance2 <- ROCR::performance(tunned.bag.rf_prediction2,
measure = "tpr",
x.measure = "tnr")
#Plot ROC curve
plot(tunned.bag.rf_performance2, main = "TPR v.s. TNR")
# Dashed reference line from (0, 1) to (1, 0).
abline(a = 1, b = -1, lty = 2)
#==============================================================
#from random forest
# Print the accuracy-selected random forest (50 trees, mtry = 6).
final.thres3.half
##
## Call:
## randomForest(formula = f, data = data, classwt = classwt, cutoff = cutoff, ntree = 50L, importance = TRUE)
## Type of random forest: classification
## Number of trees: 50
## No. of variables tried at each split: 6
##
## OOB estimate of error rate: 13.57%
## Confusion matrix:
## <=50K >50K class.error
## <=50K 23021 1699 0.06872977
## >50K 2698 4984 0.35121062
#Getting predicted >50K of income probabilities
untunned.rf_prob3 <- predict(final.thres3.half, newdata = newtest2,
type = "prob")[, 2]
untunned.rf_prediction3 <- prediction(untunned.rf_prob3, newtest2$income)
# TPR (y) vs TNR (x) for the accuracy-selected random forest.
untunned.rf_performance3 <- ROCR::performance(untunned.rf_prediction3,
measure = "tpr", x.measure = "tnr")
#Plot ROC curve
plot(untunned.rf_performance3, main = "TPR v.s. TNR")
# Dashed reference line from (0, 1) to (1, 0).
abline(a = 1, b = -1, lty = 2)
#==============================================================
#from boosted
# Print the accuracy-selected gbm (bernoulli loss, 5000 trees, depth 3,
# shrinkage 0.2).
final.thres4
## gbm(formula = income ~ ., distribution = "bernoulli", data = combined[1:32402,
## ], n.trees = 5000, interaction.depth = 3, shrinkage = 0.2)
## A gradient boosted model with bernoulli loss function.
## 5000 iterations were performed.
## There were 43 predictors of which 42 had non-zero influence.
#ROC curve - testing
# Removed the dead `pos5b <- c()` initialization -- it was immediately
# overwritten by the predict() assignment below and served no purpose.
# NOTE(review): n.trees = 800 although the model was fit with 5000 trees --
# presumably an intentional early stop; confirm.
pos5b <- predict(final.thres4, newdata = combined[32403:48598, -44],
                 n.trees = 800, type = "response")
predicts5b <- prediction(pos5b, combined[32403:48598, 44])
roc5b <- ROCR::performance(predicts5b, measure = "tpr", x.measure = "tnr")
plot(roc5b, main = "TPR v.s. TNR")
# lty = 2 added for consistency with the dashed reference lines used by the
# other TPR-vs-TNR panels in this report.
abline(a = 1, b = -1, col = "red", lty = 2)
# Overlay the four accuracy-selected TPR-vs-TNR curves on a single panel.
plot(tree_performance2, col = "blue",
     main = "TPR v.s. TNR - Accuracy selection")
plot(tunned.bag.rf_performance2, add = TRUE, col = "red")
plot(untunned.rf_performance3, add = TRUE, col = "green")
plot(roc5b, add = TRUE)  # boosted curve, drawn in the default black
# Dashed diagonal from (0, 1) to (1, 0) as a reference.
abline(a = 1, b = -1, lty = 2)
# Legend colors follow the draw order above (boosted was drawn in black).
legend("bottomleft",
       legend = c("Classification", "Bagged", "Boosted", "Random Forest"),
       col = c("blue", "red", "black", "green"),
       lwd = 3, cex = 0.5, horiz = TRUE)